@@ -302,37 +302,16 @@ def test_blob_image_normalize_to_bq(images_mm_df: bpd.DataFrame, bq_connection:
302
302
303
303
304
304
@pytest .mark .parametrize (
305
- "verbose, expected " ,
305
+ "verbose" ,
306
306
[
307
- (
308
- True ,
309
- pd .Series (
310
- [
311
- {"status" : "File has not been decrypted" , "content" : "" },
312
- {
313
- "status" : "" ,
314
- "content" : "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
315
- },
316
- ]
317
- ),
318
- ),
319
- (
320
- False ,
321
- pd .Series (
322
- [
323
- "" ,
324
- "Sample PDF This is a testing file. Some dummy messages are used for testing purposes. " ,
325
- ],
326
- name = "pdf" ,
327
- ),
328
- ),
307
+ (True ),
308
+ (False ),
329
309
],
330
310
)
331
311
def test_blob_pdf_extract (
332
312
pdf_mm_df : bpd .DataFrame ,
333
313
verbose : bool ,
334
314
bq_connection : str ,
335
- expected : pd .Series ,
336
315
):
337
316
actual = (
338
317
pdf_mm_df ["pdf" ]
@@ -341,49 +320,44 @@ def test_blob_pdf_extract(
341
320
.to_pandas ()
342
321
)
343
322
344
- pd .testing .assert_series_equal (
345
- actual ,
346
- expected ,
347
- check_dtype = False ,
348
- check_index = False ,
323
+ # check relative length
324
+ expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
325
+ expected_len = len (expected_text )
326
+
327
+ actual_text = ""
328
+ if verbose :
329
+ # The first entry could not be extracted (it carries a non-empty status), so keep only the rows whose status is empty and use the first of those
330
+ successful_results = actual [actual .apply (lambda x : x ["status" ] == "" )]
331
+ actual_text = successful_results .apply (lambda x : x ["content" ]).iloc [0 ]
332
+ else :
333
+ actual_text = actual [actual != "" ].iloc [0 ]
334
+ actual_len = len (actual_text )
335
+
336
+ relative_length_tolerance = 0.25
337
+ min_acceptable_len = expected_len * (1 - relative_length_tolerance )
338
+ max_acceptable_len = expected_len * (1 + relative_length_tolerance )
339
+ assert min_acceptable_len <= actual_len <= max_acceptable_len , (
340
+ f"Item (verbose={ verbose } ): Extracted text length { actual_len } is outside the acceptable range "
341
+ f"[{ min_acceptable_len :.0f} , { max_acceptable_len :.0f} ]. "
342
+ f"Expected reference length was { expected_len } . "
349
343
)
350
344
345
+ # check for major keywords
346
+ major_keywords = ["Sample" , "PDF" , "testing" , "dummy" , "messages" ]
347
+ for keyword in major_keywords :
348
+ assert (
349
+ keyword .lower () in actual_text .lower ()
350
+ ), f"Item (verbose={ verbose } ): Expected keyword '{ keyword } ' not found in extracted text. "
351
+
351
352
352
353
@pytest .mark .parametrize (
353
- "verbose, expected " ,
354
+ "verbose" ,
354
355
[
355
- (
356
- True ,
357
- pd .Series (
358
- [
359
- {"status" : "File has not been decrypted" , "content" : []},
360
- {
361
- "status" : "" ,
362
- "content" : [
363
- "Sample PDF This is a testing file. Some " ,
364
- "dummy messages are used for testing " ,
365
- "purposes. " ,
366
- ],
367
- },
368
- ]
369
- ),
370
- ),
371
- (
372
- False ,
373
- pd .Series (
374
- [
375
- pd .NA ,
376
- "Sample PDF This is a testing file. Some " ,
377
- "dummy messages are used for testing " ,
378
- "purposes. " ,
379
- ],
380
- ),
381
- ),
356
+ (True ),
357
+ (False ),
382
358
],
383
359
)
384
- def test_blob_pdf_chunk (
385
- pdf_mm_df : bpd .DataFrame , verbose : bool , bq_connection : str , expected : pd .Series
386
- ):
360
+ def test_blob_pdf_chunk (pdf_mm_df : bpd .DataFrame , verbose : bool , bq_connection : str ):
387
361
actual = (
388
362
pdf_mm_df ["pdf" ]
389
363
.blob .pdf_chunk (
@@ -397,13 +371,36 @@ def test_blob_pdf_chunk(
397
371
.to_pandas ()
398
372
)
399
373
400
- pd .testing .assert_series_equal (
401
- actual ,
402
- expected ,
403
- check_dtype = False ,
404
- check_index = False ,
374
+ # check relative length
375
+ expected_text = "Sample PDF This is a testing file. Some dummy messages are used for testing purposes."
376
+ expected_len = len (expected_text )
377
+
378
+ actual_text = ""
379
+ if verbose :
380
+ # The first entry could not be chunked (it carries a non-empty status), so keep only the rows whose status is empty and join the chunks of the first of those
381
+ successful_results = actual [actual .apply (lambda x : x ["status" ] == "" )]
382
+ actual_text = "" .join (successful_results .apply (lambda x : x ["content" ]).iloc [0 ])
383
+ else :
384
+ # First entry is NA
385
+ actual_text = "" .join (actual .dropna ())
386
+ actual_len = len (actual_text )
387
+
388
+ relative_length_tolerance = 0.25
389
+ min_acceptable_len = expected_len * (1 - relative_length_tolerance )
390
+ max_acceptable_len = expected_len * (1 + relative_length_tolerance )
391
+ assert min_acceptable_len <= actual_len <= max_acceptable_len , (
392
+ f"Item (verbose={ verbose } ): Extracted text length { actual_len } is outside the acceptable range "
393
+ f"[{ min_acceptable_len :.0f} , { max_acceptable_len :.0f} ]. "
394
+ f"Expected reference length was { expected_len } . "
405
395
)
406
396
397
+ # check for major keywords
398
+ major_keywords = ["Sample" , "PDF" , "testing" , "dummy" , "messages" ]
399
+ for keyword in major_keywords :
400
+ assert (
401
+ keyword .lower () in actual_text .lower ()
402
+ ), f"Item (verbose={ verbose } ): Expected keyword '{ keyword } ' not found in extracted text. "
403
+
407
404
408
405
@pytest .mark .parametrize (
409
406
"model_name, verbose" ,
0 commit comments