88from typing import Literal
99from unittest .mock import patch
1010
11- import pandas as pd
1211import pytest
1312from azure .ai .formrecognizer import AnalyzeResult
1413
@@ -148,11 +147,15 @@ def result(self) -> AnalyzeResult:
148147 docs = out ["documents" ]
149148 assert len (docs ) == 2
150149 # Checking the table doc extracted
151- assert docs [0 ].content_type == "table"
152- assert docs [0 ].dataframe .shape [0 ] == 4 # number of rows
153- assert docs [0 ].dataframe .shape [1 ] == 4 # number of columns
154- assert list (docs [0 ].dataframe .columns ) == ["" , "Column 1" , "Column 2" , "Column 3" ]
155- assert list (docs [0 ].dataframe .iloc [3 ]) == ["D" , "$54.35" , "$6345." , "" ]
150+ assert (
151+ docs [0 ].content
152+ == """,Column 1,Column 2,Column 3
153+ A,324,55 million units,2022
154+ B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
155+ C,23.53%,A short string.,
156+ D,$54.35,$6345.,
157+ """
158+ )
156159 assert (
157160 docs [0 ].meta ["preceding_context" ] == "specification. These proprietary technologies are not "
158161 "standardized and their\n specification is published only on "
@@ -191,13 +194,21 @@ def result(self) -> AnalyzeResult:
191194 docs = out ["documents" ]
192195 assert len (docs ) == 2
193196 # Checking the table doc extracted that is missing bounding info
194- assert docs [0 ].content_type == "table"
195- assert docs [0 ].dataframe .shape [0 ] == 4 # number of rows
196- assert docs [0 ].dataframe .shape [1 ] == 4 # number of columns
197- assert list (docs [0 ].dataframe .columns ) == ["" , "Column 1" , "Column 2" , "Column 3" ]
198- assert list (docs [0 ].dataframe .iloc [3 ]) == ["D" , "$54.35" , "$6345." , "" ]
199- # TODO below assert fails
200- # assert docs[0].meta["preceding_context"] == ""
197+ assert (
198+ docs [0 ].content
199+ == """,Column 1,Column 2,Column 3
200+ A,324,55 million units,2022
201+ B,"234,523.00",The quick brown fox jumped over the lazy dog.,54x growth
202+ C,23.53%,A short string.,
203+ D,$54.35,$6345.,
204+ """
205+ )
206+ assert docs [0 ].meta ["preceding_context" ] == (
207+ "specification. These proprietary technologies are not standardized and their\n specification is published "
208+ "only on Adobe's website. Many of them are also not\n supported by popular third-party implementations of "
209+ "PDF."
210+ )
211+ assert docs [0 ].meta ["following_context" ] == ""
201212
202213 @patch ("haystack.utils.auth.EnvVarSecret.resolve_value" )
203214 def test_azure_converter_with_multicolumn_header_table (self , mock_resolve_value , test_files_path ) -> None :
@@ -213,20 +224,17 @@ def result(self) -> AnalyzeResult:
213224 azure_mock .return_value = MockPoller ()
214225 ocr_node = AzureOCRDocumentConverter (endpoint = "" )
215226
216- # TODO: fails because of non-unique column names, azure_sample_pdf_3.json has duplicate column names
217227 out = ocr_node .run (sources = [test_files_path / "pdf" / "sample_pdf_3.pdf" ])
218228
219229 docs = out ["documents" ]
220230 assert len (docs ) == 2
221- assert docs [0 ].content_type == "table"
222- assert docs [0 ].dataframe .shape [0 ] == 1 # number of rows
223- assert docs [0 ].dataframe .shape [1 ] == 3 # number of columns
224- assert list (docs [0 ].dataframe .columns ) == ["This is a subheader" , "This is a subheader" , "This is a subheader" ]
225- assert list (docs [0 ].dataframe .iloc [0 ]) == ["Value 1" , "Value 2" , "Val 3" ]
231+ assert docs [0 ].content == "This is a subheader,This is a subheader,This is a subheader\n Value 1,Value 2,Val 3\n "
226232 assert (
227233 docs [0 ].meta ["preceding_context" ]
228234 == "Table 1. This is an example table with two multicolumn headers\n Header 1"
229235 )
236+ assert docs [0 ].meta ["following_context" ] == ""
237+ assert docs [0 ].meta ["page" ] == 1
230238
231239 @patch ("haystack.utils.auth.EnvVarSecret.resolve_value" )
232240 def test_table_pdf_with_non_empty_meta (self , mock_resolve_value , test_files_path ) -> None :
@@ -244,7 +252,6 @@ def result(self) -> AnalyzeResult:
244252 out = ocr_node .run (sources = [test_files_path / "pdf" / "sample_pdf_1.pdf" ], meta = [{"test" : "value_1" }])
245253
246254 docs = out ["documents" ]
247- # TODO assert below changed from the original test
248255 assert docs [1 ].meta ["test" ] == "value_1"
249256
250257 @pytest .mark .integration
@@ -307,27 +314,6 @@ def test_run_with_store_full_path_false(self, test_files_path):
307314 assert "Sample Docx File" in documents [0 ].content
308315 assert documents [0 ].meta ["file_path" ] == "sample_docx.docx"
309316
310- @patch ("haystack.utils.auth.EnvVarSecret.resolve_value" )
311- def test_hashing_dataframe (self , mock_resolve_value ):
312- mock_resolve_value .return_value = "test_api_key"
313- component = AzureOCRDocumentConverter (endpoint = "" )
314- hash_length = 32
315-
316- df = pd .DataFrame ({"A" : [1 , 2 , 3 ]})
317- hash_string_1 = component ._hash_dataframe (df )
318- assert len (hash_string_1 ) == hash_length
319-
320- df = pd .DataFrame ({"A" : [1 , 2 , 3 ], "B" : [4 , 5 , 6 ]})
321- hash_string_2 = component ._hash_dataframe (df )
322- assert len (hash_string_2 ) == hash_length
323-
324- df = pd .DataFrame ({"B" : [4 , 5 , 6 ], "A" : [1 , 2 , 3 ], "D" : [7 , 8 , 9 ]})
325- hash_string_3 = component ._hash_dataframe (df )
326- assert len (hash_string_3 ) == hash_length
327-
328- # doesn't mean much, more for sanity check
329- assert hash_string_1 != hash_string_2 != hash_string_3
330-
331317 @patch ("haystack.utils.auth.EnvVarSecret.resolve_value" )
332318 def test_meta_from_byte_stream (self , mock_resolve_value , test_files_path ) -> None :
333319 mock_resolve_value .return_value = "test_api_key"
@@ -341,8 +327,8 @@ def result(self) -> AnalyzeResult:
341327 with patch ("azure.ai.formrecognizer.DocumentAnalysisClient.begin_analyze_document" ) as azure_mock :
342328 azure_mock .return_value = MockPoller ()
343329 ocr_node = AzureOCRDocumentConverter (endpoint = "" )
344- bytes = (test_files_path / "pdf" / "sample_pdf_1.pdf" ).read_bytes ()
345- byte_stream = ByteStream (data = bytes , meta = {"test_from" : "byte_stream" })
330+ bytes_ = (test_files_path / "pdf" / "sample_pdf_1.pdf" ).read_bytes ()
331+ byte_stream = ByteStream (data = bytes_ , meta = {"test_from" : "byte_stream" })
346332 out = ocr_node .run (sources = [byte_stream ], meta = [{"test" : "value_1" }])
347333
348334 docs = out ["documents" ]
0 commit comments