@@ -18,6 +18,7 @@ def test_get_usage_info(client_v2):
1818 "current_page_count_form" ,
1919 "current_page_count_high_quality" ,
2020 "current_page_count_native_text" ,
21+ "current_page_count_excel" ,
2122 "daily_quota" ,
2223 "monthly_quota" ,
2324 "overage_page_count" ,
@@ -44,7 +45,10 @@ def test_get_usage_info(client_v2):
4445def test_whisper_v2 (client_v2 , data_dir , output_mode , mode , input_file ):
4546 file_path = os .path .join (data_dir , input_file )
4647 whisper_result = client_v2 .whisper (
47- mode = mode , output_mode = output_mode , file_path = file_path , wait_for_completion = True
48+ mode = mode ,
49+ output_mode = output_mode ,
50+ file_path = file_path ,
51+ wait_for_completion = True ,
4852 )
4953 logger .debug (f"Result for '{ output_mode } ', '{ mode } ', " f"'{ input_file } : { whisper_result } " )
5054
@@ -54,24 +58,62 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
5458 assert_extracted_text (exp_file , whisper_result , mode , output_mode )
5559
5660
@pytest.mark.parametrize(
    "output_mode, mode, input_file",
    [
        ("layout_preserving", "high_quality", "test.json"),
    ],
)
def test_whisper_v2_error(client_v2, data_dir, output_mode, mode, input_file):
    """Check that whispering the parametrized input yields an error result.

    The file is submitted through ``client_v2.whisper`` with
    ``wait_for_completion=True`` and the returned value is handed to
    ``assert_error_message``.

    NOTE(review): presumably ``test.json`` is rejected because JSON is not a
    supported document type — confirm against the service behaviour.
    """
    file_path = os.path.join(data_dir, input_file)

    whisper_result = client_v2.whisper(
        mode=mode,
        output_mode=output_mode,
        file_path=file_path,
        wait_for_completion=True,
    )
    # The closing quote after the file name was missing in the log message.
    logger.debug(f"Result for '{output_mode}', '{mode}', '{input_file}': {whisper_result}")

    assert_error_message(whisper_result)
79+
80+
5781@pytest .mark .parametrize (
5882 "output_mode, mode, url, input_file, page_count" ,
5983 [
60- ("layout_preserving" , "native_text" , "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
61- "credit_card.pdf" , 7 ),
62- ("layout_preserving" , "low_cost" , "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
63- "credit_card.pdf" , 7 ),
64- ("layout_preserving" , "high_quality" , "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf" ,
65- "restaurant_invoice_photo.pdf" , 1 ),
66- ("layout_preserving" , "form" , "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf" ,
67- "handwritten-form.pdf" , 1 ),
68- ]
84+ (
85+ "layout_preserving" ,
86+ "native_text" ,
87+ "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
88+ "credit_card.pdf" ,
89+ 7 ,
90+ ),
91+ (
92+ "layout_preserving" ,
93+ "low_cost" ,
94+ "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf" ,
95+ "credit_card.pdf" ,
96+ 7 ,
97+ ),
98+ (
99+ "layout_preserving" ,
100+ "high_quality" ,
101+ "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf" ,
102+ "restaurant_invoice_photo.pdf" ,
103+ 1 ,
104+ ),
105+ (
106+ "layout_preserving" ,
107+ "form" ,
108+ "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf" ,
109+ "handwritten-form.pdf" ,
110+ 1 ,
111+ ),
112+ ],
69113)
70114def test_whisper_v2_url_in_post (client_v2 , data_dir , output_mode , mode , url , input_file , page_count ):
71115 usage_before = client_v2 .get_usage_info ()
72- whisper_result = client_v2 .whisper (
73- mode = mode , output_mode = output_mode , url = url , wait_for_completion = True
74- )
116+ whisper_result = client_v2 .whisper (mode = mode , output_mode = output_mode , url = url , wait_for_completion = True )
75117 logger .debug (f"Result for '{ output_mode } ', '{ mode } ', " f"'{ input_file } : { whisper_result } " )
76118
77119 exp_basename = f"{ Path (input_file ).stem } .{ mode } .{ output_mode } .txt"
@@ -83,6 +125,12 @@ def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, inp
83125 verify_usage (usage_before , usage_after , page_count , mode )
84126
85127
def assert_error_message(whisper_result):
    """Assert that ``whisper_result`` describes a failed whisper call.

    The result must be a dict whose ``"status"`` equals ``"error"`` and whose
    ``"message"`` contains the substring ``"error"``.

    Args:
        whisper_result: Value returned by the client's ``whisper`` call.

    Raises:
        AssertionError: If the result is not a dict, if either key is missing,
            or if the values do not match the expectations above.
    """
    assert isinstance(whisper_result, dict), (
        f"Expected a dict result, got {type(whisper_result).__name__}"
    )

    # Use .get so a missing key is reported as an assertion failure with the
    # full payload instead of an uninformative KeyError.
    status = whisper_result.get("status")
    assert status == "error", f"Expected status 'error', got {status!r}: {whisper_result}"

    message = whisper_result.get("message")
    assert message is not None and "error" in message, (
        f"Expected 'error' in message, got {message!r}"
    )
132+
133+
86134def assert_extracted_text (file_path , whisper_result , mode , output_mode ):
87135 with open (file_path , encoding = "utf-8" ) as f :
88136 exp = f .read ()
@@ -91,34 +139,45 @@ def assert_extracted_text(file_path, whisper_result, mode, output_mode):
91139 assert whisper_result ["status_code" ] == 200
92140
93141 # For OCR based processing
94- threshold = 0.97
142+ threshold = 0.94
95143
96144 # For text based processing
97145 if mode == "native_text" and output_mode == "text" :
98146 threshold = 0.99
147+ elif mode == "low_cost" :
148+ threshold = 0.90
99149 extracted_text = whisper_result ["extraction" ]["result_text" ]
100150 similarity = SequenceMatcher (None , extracted_text , exp ).ratio ()
101151
102152 if similarity < threshold :
103153 diff = "\n " .join (
104- unified_diff (exp .splitlines (), extracted_text .splitlines (), fromfile = "Expected" , tofile = "Extracted" )
154+ unified_diff (
155+ exp .splitlines (),
156+ extracted_text .splitlines (),
157+ fromfile = "Expected" ,
158+ tofile = "Extracted" ,
159+ )
105160 )
106- pytest .fail (f"Texts are not similar enough: { similarity * 100 :.2f} % similarity. Diff: \n { diff } " )
161+ pytest .fail (f"Diff: \n { diff } . \n Texts are not similar enough: { similarity * 100 :.2f} % similarity. " )
107162
108163
def verify_usage(before_extract, after_extract, page_count, mode="form"):
    """Assert that usage counters grew by exactly ``page_count`` for ``mode``.

    Args:
        before_extract: Usage mapping captured before the extraction.
        after_extract: Usage mapping captured after the extraction.
        page_count: Number of pages the extraction is expected to have added.
        mode: Processing mode used for the extraction; one of ``"form"``,
            ``"high_quality"``, ``"low_cost"`` or ``"native_text"``.

    Raises:
        ValueError: If ``mode`` is not one of the modes listed above.
        AssertionError: If any counter differs from its expected value.
    """
    all_modes = ("form", "high_quality", "low_cost", "native_text")
    if mode not in all_modes:
        raise ValueError(f"Unknown mode {mode!r}; expected one of {all_modes}")

    # Overall counters must both advance by the processed page count.
    for counter in ("today_page_count", "current_page_count"):
        assert (
            after_extract[counter] == before_extract[counter] + page_count
        ), f"{counter} calculation is wrong"

    # The overage counter is only compared when it is positive afterwards.
    if after_extract["overage_page_count"] > 0:
        assert (
            after_extract["overage_page_count"] == before_extract["overage_page_count"] + page_count
        ), "overage_page_count calculation is wrong"

    # The counter of the mode that was used must advance ...
    used_key = f"current_page_count_{mode}"
    assert (
        after_extract[used_key] == before_extract[used_key] + page_count
    ), f"{mode} mode calculation is wrong"

    # ... while the counters of every other mode must stay untouched.
    for other_mode in all_modes:
        if other_mode == mode:
            continue
        other_key = f"current_page_count_{other_mode}"
        assert (
            after_extract[other_key] == before_extract[other_key]
        ), f"{other_mode} mode calculation is wrong"
0 commit comments