@@ -50,23 +50,75 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
5050
5151 exp_basename = f"{ Path (input_file ).stem } .{ mode } .{ output_mode } .txt"
5252 exp_file = os .path .join (data_dir , "expected" , exp_basename )
53- with open (exp_file , encoding = "utf-8" ) as f :
53+ # verify extracted text
54+ assert_extracted_text (exp_file , whisper_result , mode , output_mode )
55+
56+
@pytest.mark.parametrize(
    "output_mode, mode, url, input_file, page_count",
    [
        ("layout_preserving", "native_text", "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
         "credit_card.pdf", 7),
        ("layout_preserving", "low_cost", "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
         "credit_card.pdf", 7),
        ("layout_preserving", "high_quality", "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf",
         "restaurant_invoice_photo.pdf", 1),
        ("layout_preserving", "form", "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf",
         "handwritten-form.pdf", 1),
    ]
)
def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, input_file, page_count):
    """Extract a document referenced by URL and check both the text and the usage counters.

    Args:
        client_v2: Client fixture exposing ``whisper`` and ``get_usage_info``.
        data_dir: Directory holding the ``expected`` sub-directory of reference texts.
        output_mode: Output mode forwarded to ``whisper``.
        mode: Processing mode forwarded to ``whisper``; also selects which
            per-mode usage counter is expected to grow.
        url: Location of the document passed to ``whisper``.
        input_file: File name whose stem picks the expected-text file.
        page_count: Number of pages the usage counters are expected to grow by.
    """
    # Snapshot usage before the call so the delta can be attributed to this extraction.
    # NOTE(review): presumably no other extraction runs against the same account
    # while this test executes, otherwise the delta check is racy - confirm.
    usage_before = client_v2.get_usage_info()
    whisper_result = client_v2.whisper(
        mode=mode, output_mode=output_mode, url=url, wait_for_completion=True
    )
    # Lazy %-formatting: the (potentially large) result is only rendered when
    # debug logging is enabled.
    logger.debug(
        "Result for '%s', '%s', '%s' (%s): %s", output_mode, mode, input_file, url, whisper_result
    )

    exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
    exp_file = os.path.join(data_dir, "expected", exp_basename)
    # verify extracted text
    assert_extracted_text(exp_file, whisper_result, mode, output_mode)

    # Verify usage after extraction
    usage_after = client_v2.get_usage_info()
    verify_usage(usage_before, usage_after, page_count, mode)
84+
85+
def assert_extracted_text(file_path, whisper_result, mode, output_mode):
    """Check a whisper result against the expected text stored in ``file_path``.

    The comparison is fuzzy: the ``SequenceMatcher`` ratio between the
    extracted and the expected text must reach 0.99 when ``mode`` is
    ``"native_text"`` and ``output_mode`` is ``"text"``, and 0.97 for every
    other combination.

    Args:
        file_path: Path of the UTF-8 file containing the expected text.
        whisper_result: Result dict; ``status_code`` and
            ``extraction["result_text"]`` are read from it.
        mode: Processing mode the extraction was run with.
        output_mode: Output mode the extraction was run with.

    Raises:
        AssertionError: If ``whisper_result`` is not a dict or its
            ``status_code`` is not 200.
        KeyError: If ``whisper_result`` lacks one of the keys read here.

    On insufficient similarity the test is failed through ``pytest.fail`` with
    the similarity percentage and a unified diff of expected vs. extracted.
    """
    with open(file_path, encoding="utf-8") as f:
        exp = f.read()

    assert isinstance(whisper_result, dict)
    assert whisper_result["status_code"] == 200

    # Text based processing gets a tighter bound than OCR based processing.
    is_text_based = mode == "native_text" and output_mode == "text"
    threshold = 0.99 if is_text_based else 0.97

    extracted_text = whisper_result["extraction"]["result_text"]
    similarity = SequenceMatcher(None, extracted_text, exp).ratio()
    if similarity >= threshold:
        return

    # splitlines() yields newline-free lines, so lineterm="" keeps the control
    # lines (---, +++, @@) newline-free too and the joined report has no
    # spurious blank lines.
    diff = "\n".join(
        unified_diff(
            exp.splitlines(),
            extracted_text.splitlines(),
            fromfile="Expected",
            tofile="Extracted",
            lineterm="",
        )
    )
    pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")
107+
108+
def verify_usage(before_extract, after_extract, page_count, mode='form'):
    """Assert that usage counters moved by exactly ``page_count`` for ``mode``.

    Compares two usage snapshots taken before and after an extraction:
    ``today_page_count``, ``current_page_count`` and the counter of the mode
    that was used must each have grown by ``page_count``, while the counters
    of all other modes must be unchanged.

    Args:
        before_extract: Usage dict captured before the extraction.
        after_extract: Usage dict captured after the extraction.
        page_count: Number of pages the extraction is expected to account for.
        mode: Mode the extraction ran in; one of ``form``, ``high_quality``,
            ``low_cost`` or ``native_text``.

    Raises:
        ValueError: If ``mode`` is not one of the known modes.
        AssertionError: If any counter does not match the expectation.
        KeyError: If a snapshot lacks one of the counters read here.
    """
    all_modes = ['form', 'high_quality', 'low_cost', 'native_text']
    if mode not in all_modes:
        raise ValueError(f"unknown mode {mode!r}; expected one of {all_modes}")
    other_modes = [name for name in all_modes if name != mode]

    assert (after_extract['today_page_count'] == before_extract['today_page_count'] + page_count), \
        "today_page_count calculation is wrong"
    assert (after_extract['current_page_count'] == before_extract['current_page_count'] + page_count), \
        "current_page_count calculation is wrong"

    # Overage is only checked once the account reports any overage at all.
    # NOTE(review): if an extraction can push an account from zero overage to a
    # partial overage, the increase would be smaller than page_count and this
    # assertion would fail - confirm the service's semantics.
    if after_extract['overage_page_count'] > 0:
        assert (after_extract['overage_page_count'] == before_extract['overage_page_count'] + page_count), \
            "overage_page_count calculation is wrong"

    mode_key = f'current_page_count_{mode}'
    assert (after_extract[mode_key] == before_extract[mode_key] + page_count), \
        f"{mode} mode calculation is wrong"

    # Modes that were not used must not have been charged.
    for other in other_modes:
        other_key = f'current_page_count_{other}'
        assert (after_extract[other_key] == before_extract[other_key]), \
            f"{other} mode calculation is wrong"
0 commit comments