@@ -106,20 +106,30 @@ def test_calculate_edit_distance():
106106
107107
108108@pytest .mark .parametrize (
109- ("filename" , "expected_score" , "expected_distance" ),
109+ ("filename" , "standardize_whitespaces" , " expected_score" , "expected_distance" ),
110110 [
111- ("fake-text.txt" , 0.78 , 38 ),
111+ ("fake-text.txt" , False , 0.78 , 38 ),
112+ ("fake-text.txt" , True , 0.92 , 12 ),
112113 ],
113114)
114- def test_calculate_edit_distance_with_filename (filename , expected_score , expected_distance ):
115+ def test_calculate_edit_distance_with_filename (
116+ filename , standardize_whitespaces , expected_score , expected_distance
117+ ):
115118 with open ("example-docs/fake-text.txt" ) as f :
116119 source_cct = f .read ()
117120
118121 elements = partition (filename = f"example-docs/{ filename } " )
119122 output_cct = "\n " .join ([str (el ) for el in elements ])
120123
121- score = text_extraction .calculate_edit_distance (output_cct , source_cct , return_as = "score" )
122- distance = text_extraction .calculate_edit_distance (output_cct , source_cct , return_as = "distance" )
124+ score = text_extraction .calculate_edit_distance (
125+ output_cct , source_cct , return_as = "score" , standardize_whitespaces = standardize_whitespaces
126+ )
127+ distance = text_extraction .calculate_edit_distance (
128+ output_cct ,
129+ source_cct ,
130+ return_as = "distance" ,
131+ standardize_whitespaces = standardize_whitespaces ,
132+ )
123133
124134 assert score >= 0
125135 assert score <= 1.0
@@ -128,6 +138,109 @@ def test_calculate_edit_distance_with_filename(filename, expected_score, expecte
128138 assert distance == expected_distance
129139
130140
@pytest.mark.parametrize(
    ("text1", "text2"),
    [
        (
            "The dog\r loved the cat, but\t \n the cat\t loved the\n cow",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "Hello my\t name\t is H a r p e r, \n what's your\v name?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t \n \t dog and a\t cat,\f I love my\n \n \n \n dog.",
            "I have a dog and a cat, I love my dog.",
        ),
        # Space-separated table versus the same table separated with tabs.
        (
            (
                "\n"
                "Name Age City Occupation\n"
                "Alice 30 New York Engineer\n"
                "Bob 25 Los Angeles Designer\n"
                "Charlie 35 Chicago Teacher\n"
                "David 40 San Francisco Developer\n"
            ),
            (
                "\n"
                "Name\t Age\t City\t Occupation\n"
                "Alice\t 30\t New York\t Engineer\n"
                "Bob\t 25\t Los Angeles\t Designer\n"
                "Charlie\t 35\t Chicago\t Teacher\n"
                "David\t 40\t San Francisco\t Developer\n"
            ),
        ),
        # Tab-separated table versus the same rows with extra blank lines.
        (
            (
                "\n"
                "Name\t Age\t City\t Occupation\n"
                "Alice\t 30\t New York\t Engineer\n"
                "Bob\t 25\t Los Angeles\t Designer\n"
                "Charlie\t 35\t Chicago\t Teacher\n"
                "David\t 40\t San Francisco\t Developer\n"
            ),
            "Name\t Age\t City\t Occupation\n \n \n Alice\t 30\t New York\t Engineer\n Bob\t 25\t Los Angeles\t Designer\n Charlie\t 35\t Chicago\t Teacher\n David\t 40\t San Francisco\t Developer",  # noqa: E501
        ),
    ],
)
def test_calculate_edit_distance_with_various_whitespace_1(text1, text2):
    """Check that text pairs differing only in whitespace match once standardized.

    With ``standardize_whitespaces=True`` the pair must produce a score of
    exactly 1.0 and a distance of exactly 0; with it disabled, the score must
    be below 1.0 and the distance above 0.
    """

    def measure(return_as, standardize):
        # Single call site so the four checks below differ only in their flags.
        return text_extraction.calculate_edit_distance(
            text1, text2, return_as=return_as, standardize_whitespaces=standardize
        )

    standardized_score = measure("score", True)
    assert standardized_score == 1.0

    standardized_distance = measure("distance", True)
    assert standardized_distance == 0

    raw_score = measure("score", False)
    assert raw_score < 1.0

    raw_distance = measure("distance", False)
    assert raw_distance > 0
209+
210+
def test_calculate_edit_distance_with_various_whitespace_2():
    """Check that standardizing whitespace brings two table renderings closer.

    A tab-separated table is compared with a pipe-bordered rendering of the
    same rows. The texts are not expected to match exactly; the test only
    requires a strictly higher score and a strictly lower distance when
    ``standardize_whitespaces=True`` than when it is ``False``.
    """
    tab_separated_table = (
        "\n"
        "Name\t Age\t City\t Occupation\n"
        "Alice\t 30\t New York\t Engineer\n"
        "Bob\t 25\t Los Angeles\t Designer\n"
        "Charlie\t 35\t Chicago\t Teacher\n"
        "David\t 40\t San Francisco\t Developer\n"
    )
    bordered_table = (
        "\n"
        "\n"
        "| Name | Age | City | Occupation |\n"
        "|---------|-----|--------------|----------------|\n"
        "| Alice | 30 | New York | Engineer |\n"
        "| Bob | 25 | Los Angeles | Designer |\n"
        "| Charlie | 35 | Chicago | Teacher |\n"
        "| David | 40 | San Francisco| Developer |\n"
        "\n"
    )

    def compare(return_as, standardize):
        # Same pair of texts every time; only the metric and the flag vary.
        return text_extraction.calculate_edit_distance(
            tab_separated_table,
            bordered_table,
            return_as=return_as,
            standardize_whitespaces=standardize,
        )

    # The standardized call is evaluated first in each comparison.
    assert compare("score", True) > compare("score", False)
    assert compare("distance", True) < compare("distance", False)
242+
243+
131244@pytest .mark .parametrize (
132245 ("text" , "expected" ),
133246 [
@@ -187,6 +300,46 @@ def test_bag_of_words(text, expected):
187300 assert text_extraction .bag_of_words (text ) == expected
188301
189302
@pytest.mark.parametrize(
    ("text", "expected"),
    [
        (
            "The dog\r loved the cat, but\t \n the cat\t loved the\n cow\n \n ",
            "The dog loved the cat, but the cat loved the cow",
        ),
        (
            "\n \n Hello my\t name\t is H a r p e r, \n what's your\v name?",
            "Hello my name is H a r p e r, what's your name?",
        ),
        (
            "I have a\t \n \t dog and a\t cat,\f I love my\n \n \n \n dog.",
            "I have a dog and a cat, I love my dog.",
        ),
        # Multi-line verse: the expected text has the lines joined by spaces.
        (
            (
                "L is for the way you look at me\n"
                "O is for the only one I see\n"
                "V is very, very extraordinary\n"
                "E is even more than anyone that you adore can"
            ),
            "L is for the way you look at me O is for the only one I see V is very, very extraordinary E is even more than anyone that you adore can",  # noqa: E501
        ),
        # Pipe-bordered table: the expected text is the rows on one line.
        (
            (
                "\n"
                "| Name | Age | City | Occupation |\n"
                "|---------|-----|--------------|----------------|\n"
                "| Alice | 30 | New York | Engineer |\n"
                "| Bob | 25 | Los Angeles | Designer |\n"
                "| Charlie | 35 | Chicago | Teacher |\n"
                "| David | 40 | San Francisco| Developer |\n"
            ),
            "| Name | Age | City | Occupation | |---------|-----|--------------|----------------| | Alice | 30 | New York | Engineer | | Bob | 25 | Los Angeles | Designer | | Charlie | 35 | Chicago | Teacher | | David | 40 | San Francisco| Developer |",  # noqa: E501
        ),
    ],
)
def test_prepare_string(text, expected):
    """Check ``prepare_str`` with and without whitespace standardization.

    With ``standardize_whitespaces=True`` the result must equal ``expected``;
    called without that argument, the result must equal the input text.
    """
    standardized = text_extraction.prepare_str(text, standardize_whitespaces=True)
    assert standardized == expected

    untouched = text_extraction.prepare_str(text)
    assert untouched == text
341+
342+
190343@pytest .mark .parametrize (
191344 ("output_text" , "source_text" , "expected_percentage" ),
192345 [
0 commit comments