33from tempfile import SpooledTemporaryFile
44
55import pytest
6+ from pytest_mock import MockFixture
67
78from test_unstructured .unit_utils import assert_round_trips_through_JSON , example_doc_path
89from unstructured .chunking .title import chunk_by_title
2425
2526
2627def test_partition_ppt_from_filename ():
27- filename = os .path .join (EXAMPLE_DOCS_DIRECTORY , "fake-power-point.ppt" )
28- elements = partition_ppt (filename = filename )
28+ elements = partition_ppt (example_doc_path ("fake-power-point.ppt" ))
2929 assert elements == EXPECTED_PPT_OUTPUT
3030 for element in elements :
3131 assert element .metadata .filename == "fake-power-point.ppt"
@@ -34,158 +34,131 @@ def test_partition_ppt_from_filename():
3434
3535
def test_partition_ppt_from_filename_with_metadata_filename():
    """`metadata_filename` replaces the on-disk name in every element's metadata."""
    elements = partition_ppt(example_doc_path("fake-power-point.ppt"), metadata_filename="test")
    assert all(element.metadata.filename == "test" for element in elements)
4039
4140
def test_partition_ppt_raises_with_missing_file():
    """Partitioning a path with no file behind it is expected to raise `ValueError`."""
    with pytest.raises(ValueError):
        partition_ppt(example_doc_path("doesnt-exist.ppt"))
4644
4745
def test_partition_ppt_from_file():
    """Partitioning an open binary file yields the expected elements with no filename."""
    with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
        elements = partition_ppt(file=f)

    assert elements == EXPECTED_PPT_OUTPUT
    # A file object carries no path, so no filename should be recorded.
    for element in elements:
        assert element.metadata.filename is None
5552
5653
def test_partition_ppt_from_file_with_metadata_filename():
    """`metadata_filename` supplies the filename when partitioning from a file object."""
    with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
        elements = partition_ppt(file=f, metadata_filename="test")

    assert elements == EXPECTED_PPT_OUTPUT
    for element in elements:
        assert element.metadata.filename == "test"
6460
6561
def test_partition_ppt_raises_with_both_specified():
    """Passing both `filename` and `file` is expected to raise `ValueError`."""
    filename = example_doc_path("fake-power-point.ppt")
    with open(filename, "rb") as f, pytest.raises(ValueError):
        partition_ppt(filename=filename, file=f)
7066
7167
def test_partition_ppt_raises_when_neither_file_path_or_file_is_provided():
    """Calling `partition_ppt()` with no arguments is expected to raise `ValueError`."""
    with pytest.raises(ValueError):
        partition_ppt()
7571
7672
def test_partition_ppt_from_filename_exclude_metadata():
    """With `include_metadata=False`, every element from a path has empty metadata."""
    filename = example_doc_path("fake-power-point.ppt")
    elements = partition_ppt(filename=filename, include_metadata=False)
    # Iterate the elements directly instead of indexing through range(len(...)).
    for element in elements:
        assert element.metadata.to_dict() == {}
8278
8379
def test_partition_ppt_from_file_exclude_metadata():
    """With `include_metadata=False`, every element from a file object has empty metadata."""
    filename = example_doc_path("fake-power-point.ppt")
    with open(filename, "rb") as f:
        elements = partition_ppt(file=f, include_metadata=False)

    # Iterate the elements directly instead of indexing through range(len(...)).
    for element in elements:
        assert element.metadata.to_dict() == {}
9086
9187
def test_partition_ppt_pulls_metadata_last_modified_from_disk_when_file_is_a_path(
    mocker: MockFixture,
):
    """The patched on-disk modification date is recorded when partitioning from a path."""
    modified_date_on_disk = "2024-05-01T15:37:28"
    # Patch the lookup so the assertion does not depend on the real file's mtime.
    mocker.patch(
        "unstructured.partition.ppt.get_last_modified_date", return_value=modified_date_on_disk
    )

    elements = partition_ppt(example_doc_path("fake-power-point.ppt"))

    assert elements[0].metadata.last_modified == modified_date_on_disk
10899
109100
def test_partition_ppt_uses_value_in_arg_not_disk_when_metadata_last_modified_arg_provided(
    mocker: MockFixture,
):
    """An explicit `metadata_last_modified` wins over the patched on-disk date."""
    modified_date_on_disk = "2024-05-01T15:37:28"
    modified_date_in_arg = "2020-07-05T09:24:28"
    # The two dates differ so the assertion can tell which source was used.
    mocker.patch(
        "unstructured.partition.ppt.get_last_modified_date", return_value=modified_date_on_disk
    )

    elements = partition_ppt(
        example_doc_path("fake-power-point.ppt"), metadata_last_modified=modified_date_in_arg
    )

    assert elements[0].metadata.last_modified == modified_date_in_arg
128115
129116
def test_partition_ppt_suppresses_modified_date_from_file_by_default(mocker: MockFixture):
    """Without `date_from_file_object`, no modification date is taken from a file object."""
    # A date is made available here; the test asserts it is not used.
    mocker.patch(
        "unstructured.partition.ppt.get_last_modified_date_from_file",
        return_value="2029-07-05T09:24:28",
    )

    with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
        elements = partition_ppt(file=f)

    assert elements[0].metadata.last_modified is None
147127
148128
def test_partition_ppt_pulls_modified_date_from_file_when_date_from_file_object_arg_is_True(
    mocker: MockFixture,
):
    """With `date_from_file_object=True`, the patched file-object date is recorded."""
    modified_date_on_file = "2029-07-05T09:24:28"
    mocker.patch(
        "unstructured.partition.ppt.get_last_modified_date_from_file",
        return_value=modified_date_on_file,
    )

    with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
        elements = partition_ppt(file=f, date_from_file_object=True)

    assert elements[0].metadata.last_modified == modified_date_on_file
164142
165143
def test_partition_ppt_from_file_with_custom_metadata_date(mocker: MockFixture):
    """An explicit `metadata_last_modified` wins over the patched file-object date."""
    modified_date_on_file = "2029-07-05T09:24:28"
    modified_date_in_arg = "2020-07-05T09:24:28"

    # The two dates differ so the assertion can tell which source was used.
    mocker.patch(
        "unstructured.partition.ppt.get_last_modified_date_from_file",
        return_value=modified_date_on_file,
    )

    with open(example_doc_path("fake-power-point.ppt"), "rb") as f:
        elements = partition_ppt(file=f, metadata_last_modified=modified_date_in_arg)

    assert elements[0].metadata.last_modified == modified_date_in_arg
182157
183158
184- def test_partition_ppt_from_file_without_metadata_date (
185- filename = "example-docs/fake-power-point.ppt" ,
186- ):
159+ def test_partition_ppt_from_file_without_metadata_date ():
187160 """Test partition_ppt() with file that are not possible to get last modified date"""
188- with open (filename , "rb" ) as f :
161+ with open (example_doc_path ( "fake-power-point.ppt" ) , "rb" ) as f :
189162 sf = SpooledTemporaryFile ()
190163 sf .write (f .read ())
191164 sf .seek (0 )
@@ -199,25 +172,24 @@ def test_partition_ppt_with_json():
199172 assert_round_trips_through_JSON (elements )
200173
201174
def test_add_chunking_strategy_by_title_on_partition_ppt():
    """`chunking_strategy="by_title"` matches calling `chunk_by_title` on the plain elements."""
    file_path = example_doc_path("fake-power-point.ppt")
    elements = partition_ppt(file_path)
    chunk_elements = partition_ppt(file_path, chunking_strategy="by_title")
    chunks = chunk_by_title(elements)

    # Chunked output must differ from the unchunked elements and equal the explicit chunking.
    assert chunk_elements != elements
    assert chunk_elements == chunks
210182
211183
def test_partition_ppt_element_metadata_has_languages():
    """The first element of the sample deck is tagged with languages `["eng"]`."""
    elements = partition_ppt(example_doc_path("fake-power-point.ppt"))
    assert elements[0].metadata.languages == ["eng"]
216187
217188
218189def test_partition_ppt_respects_detect_language_per_element ():
219- filename = "example-docs/language-docs/eng_spa_mult.ppt"
220- elements = partition_ppt (filename = filename , detect_language_per_element = True )
190+ elements = partition_ppt (
191+ example_doc_path ("language-docs/eng_spa_mult.ppt" ), detect_language_per_element = True
192+ )
221193 langs = [element .metadata .languages for element in elements ]
222194 # languages other than English and Spanish are detected by this partitioner,
223195 # so this test is slightly different from the other partition tests
0 commit comments