@@ -69,3 +69,72 @@ def test_whisper_v2(client_v2, data_dir, output_mode, mode, input_file):
6969 unified_diff (exp .splitlines (), extracted_text .splitlines (), fromfile = "Expected" , tofile = "Extracted" )
7070 )
7171 pytest .fail (f"Texts are not similar enough: { similarity * 100 :.2f} % similarity. Diff:\n { diff } " )
72+
73+
@pytest.mark.parametrize(
    "output_mode, mode, url, input_file, page_count",
    [
        (
            "layout_preserving",
            "native_text",
            "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
            "credit_card.pdf",
            7,
        ),
        (
            "layout_preserving",
            "low_cost",
            "https://unstractpocstorage.blob.core.windows.net/public/Amex.pdf",
            "credit_card.pdf",
            7,
        ),
        (
            "layout_preserving",
            "high_quality",
            "https://unstractpocstorage.blob.core.windows.net/public/scanned_bill.pdf",
            "restaurant_invoice_photo.pdf",
            1,
        ),
        (
            "layout_preserving",
            "form",
            "https://unstractpocstorage.blob.core.windows.net/public/scanned_form.pdf",
            "handwritten-form.pdf",
            1,
        ),
    ],
)
def test_whisper_v2_url_in_post(client_v2, data_dir, output_mode, mode, url, input_file, page_count):
    """Extract a document passed by URL and check both the text and the usage counters.

    The document at ``url`` is submitted through ``client_v2.whisper`` with
    ``wait_for_completion=True``. The returned ``result_text`` is compared with
    the fixture ``<data_dir>/expected/<input_file stem>.<mode>.<output_mode>.txt``,
    and the usage info fetched before and after the call is handed to
    ``verify_usage`` together with ``page_count``.

    ``input_file`` is only used to derive the expected-output file name and for
    logging; the document itself is always fetched from ``url``.
    """
    # Snapshot usage first so the delta caused by this extraction can be verified.
    usage_before = client_v2.get_usage_info()
    whisper_result = client_v2.whisper(
        mode=mode, output_mode=output_mode, url=url, wait_for_completion=True
    )
    # Lazy %-formatting: the (potentially large) result is only rendered when
    # DEBUG logging is enabled.
    logger.debug(
        "Result for '%s', '%s', '%s': %s", output_mode, mode, input_file, whisper_result
    )

    # Check the response envelope before touching fixtures so API failures are
    # reported as such.
    assert isinstance(whisper_result, dict)
    assert whisper_result["status_code"] == 200

    exp_basename = f"{Path(input_file).stem}.{mode}.{output_mode}.txt"
    exp = (Path(data_dir) / "expected" / exp_basename).read_text(encoding="utf-8")

    extracted_text = whisper_result["extraction"]["result_text"]

    # For text based processing, perform a strict match
    if mode == "native_text" and output_mode == "text":
        assert extracted_text == exp
    # For OCR based processing, perform a fuzzy match
    else:
        similarity = SequenceMatcher(None, extracted_text, exp).ratio()
        threshold = 0.97

        if similarity < threshold:
            diff = "\n".join(
                unified_diff(exp.splitlines(), extracted_text.splitlines(), fromfile="Expected", tofile="Extracted")
            )
            pytest.fail(f"Texts are not similar enough: {similarity * 100:.2f}% similarity. Diff:\n{diff}")

    usage_after = client_v2.get_usage_info()
    # Verify usage after extraction
    verify_usage(usage_before, usage_after, page_count, mode)
122+
123+
def verify_usage(before_extract, after_extract, page_count, mode='form'):
    """Assert that the usage counters grew by ``page_count`` pages for ``mode`` only.

    Args:
        before_extract: Usage mapping captured before the extraction.
        after_extract: Usage mapping captured after the extraction.
        page_count: Number of pages the extraction is expected to have consumed.
        mode: Processing mode that was used; one of ``'form'``,
            ``'high_quality'``, ``'low_cost'`` or ``'native_text'``.

    Raises:
        ValueError: If ``mode`` is not one of the known modes.
        AssertionError: If any counter does not match the expected value.
    """
    all_modes = ('form', 'high_quality', 'low_cost', 'native_text')
    if mode not in all_modes:
        raise ValueError(f"unknown mode {mode!r}; expected one of {all_modes}")
    other_modes = [candidate for candidate in all_modes if candidate != mode]

    assert (after_extract['today_page_count'] == before_extract['today_page_count'] + page_count), \
        "today_page_count calculation is wrong"

    # NOTE(review): -1 presumably marks a counter that is not tracked/limited;
    # the check is skipped in that case - confirm against the usage API.
    if after_extract['current_page_count'] != -1:
        assert (after_extract['current_page_count'] == before_extract['current_page_count'] + page_count), \
            "current_page_count calculation is wrong"

    # NOTE(review): if this extraction is the one that first pushes usage into
    # overage, the overage delta may be smaller than page_count - confirm.
    if after_extract['overage_page_count'] > 0:
        assert (after_extract['overage_page_count'] == before_extract['overage_page_count'] + page_count), \
            "overage_page_count calculation is wrong"

    # The counter of the mode that was used must grow by exactly page_count ...
    assert (after_extract[f'current_page_count_{mode}'] == before_extract[f'current_page_count_{mode}'] + page_count), \
        f"{mode} mode calculation is wrong"

    # ... while the counters of every other mode must stay untouched.
    for other in other_modes:
        assert (after_extract[f'current_page_count_{other}'] ==
                before_extract[f'current_page_count_{other}']), \
            f"{other} mode calculation is wrong"
0 commit comments