Skip to content

Commit acd070c

Browse files
feat: enhance pdfminer element cleanup (#3593)
This PR aims to expand removal of `pdfminer` elements to include those inside all `non-pdfminer` elements, not just `tables`. --------- Co-authored-by: ryannikolaidis <[email protected]> Co-authored-by: christinestraub <[email protected]>
1 parent d51fb13 commit acd070c

File tree

5 files changed

+9
-176
lines changed

5 files changed

+9
-176
lines changed

CHANGELOG.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
1-
## 0.15.10-dev2
1+
## 0.15.10-dev3
22

33
### Enhancements
44

5+
* **Enhance `pdfminer` element cleanup** Expand removal of `pdfminer` elements to include those inside all `non-pdfminer` elements, not just `tables`.
56
* **Modified analysis drawing tools to dump to files and draw from dumps** If the `analysis` parameter of the `partition_pdf` function is set to `True`, the layouts for Object Detection, Pdfminer Extraction, OCR, and the final layout will be dumped as JSON files. The drawers now accept dict (dump) objects instead of instances of internal classes.
67
* **Vectorize pdfminer elements deduplication computation** Use `numpy` operations to compute IOU and sub-region membership instead of using a simple loop. This improves the speed of deduplicating elements for pages with a lot of elements.
78

test_unstructured_ingest/expected-structured-output/google-drive/recalibrating-risk-report.pdf.json

Lines changed: 1 addition & 125 deletions
Original file line numberDiff line numberDiff line change
@@ -3595,133 +3595,9 @@
35953595
}
35963596
}
35973597
},
3598-
{
3599-
"type": "Image",
3600-
"element_id": "b0197950e1af5c2aac10f5b67d61524a",
3601-
"text": "",
3602-
"metadata": {
3603-
"filetype": "application/pdf",
3604-
"languages": [
3605-
"eng"
3606-
],
3607-
"page_number": 8,
3608-
"data_source": {
3609-
"url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
3610-
"record_locator": {
3611-
"file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
3612-
},
3613-
"date_created": "1718723636.34",
3614-
"date_modified": "1676196572.0",
3615-
"permissions_data": [
3616-
{
3617-
"id": "18298851591250030956",
3618-
"displayName": "[email protected]",
3619-
"type": "user",
3620-
"kind": "drive#permission",
3621-
"photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
3622-
"emailAddress": "[email protected]",
3623-
"role": "writer",
3624-
"deleted": false,
3625-
"pendingOwner": false
3626-
},
3627-
{
3628-
"id": "04774006893477068632",
3629-
"displayName": "ryan",
3630-
"type": "user",
3631-
"kind": "drive#permission",
3632-
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
3633-
"emailAddress": "[email protected]",
3634-
"role": "writer",
3635-
"deleted": false,
3636-
"pendingOwner": false
3637-
},
3638-
{
3639-
"id": "anyoneWithLink",
3640-
"type": "anyone",
3641-
"kind": "drive#permission",
3642-
"role": "reader",
3643-
"allowFileDiscovery": false
3644-
},
3645-
{
3646-
"id": "09147371668407854156",
3647-
"displayName": "roman",
3648-
"type": "user",
3649-
"kind": "drive#permission",
3650-
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
3651-
"emailAddress": "[email protected]",
3652-
"role": "owner",
3653-
"deleted": false,
3654-
"pendingOwner": false
3655-
}
3656-
]
3657-
}
3658-
}
3659-
},
3660-
{
3661-
"type": "Image",
3662-
"element_id": "34d2dd4af420ea3fdddc8fc5d581cac2",
3663-
"text": "",
3664-
"metadata": {
3665-
"filetype": "application/pdf",
3666-
"languages": [
3667-
"eng"
3668-
],
3669-
"page_number": 8,
3670-
"data_source": {
3671-
"url": "https://drive.google.com/uc?id=1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV&export=download",
3672-
"record_locator": {
3673-
"file_id": "1m1TUgyLv0hHdlsuL7DOWBAKQtvrhWNiV"
3674-
},
3675-
"date_created": "1718723636.34",
3676-
"date_modified": "1676196572.0",
3677-
"permissions_data": [
3678-
{
3679-
"id": "18298851591250030956",
3680-
"displayName": "[email protected]",
3681-
"type": "user",
3682-
"kind": "drive#permission",
3683-
"photoLink": "https://lh3.googleusercontent.com/a/ACg8ocJok2KRwwYvrEDkeZVCYosHOMoa52GZa2qIIC1jScCRoFLHaQ=s64",
3684-
"emailAddress": "[email protected]",
3685-
"role": "writer",
3686-
"deleted": false,
3687-
"pendingOwner": false
3688-
},
3689-
{
3690-
"id": "04774006893477068632",
3691-
"displayName": "ryan",
3692-
"type": "user",
3693-
"kind": "drive#permission",
3694-
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjXeWpu7QcZuYqIl3p1mwqzS8XGFJ4RqA3Xjljfkm1DcFZ9M7A=s64",
3695-
"emailAddress": "[email protected]",
3696-
"role": "writer",
3697-
"deleted": false,
3698-
"pendingOwner": false
3699-
},
3700-
{
3701-
"id": "anyoneWithLink",
3702-
"type": "anyone",
3703-
"kind": "drive#permission",
3704-
"role": "reader",
3705-
"allowFileDiscovery": false
3706-
},
3707-
{
3708-
"id": "09147371668407854156",
3709-
"displayName": "roman",
3710-
"type": "user",
3711-
"kind": "drive#permission",
3712-
"photoLink": "https://lh3.googleusercontent.com/a-/ALV-UjWoGrFCgXcF6CtiBIBLnAfM68qUnQaJOcgvg3qzfQ3W8Ch6dA=s64",
3713-
"emailAddress": "[email protected]",
3714-
"role": "owner",
3715-
"deleted": false,
3716-
"pendingOwner": false
3717-
}
3718-
]
3719-
}
3720-
}
3721-
},
37223598
{
37233599
"type": "FigureCaption",
3724-
"element_id": "a8ac039aa1d77ac96ecd4c8c14a556d5",
3600+
"element_id": "7803862f2804d04dfe8c38c4a353001d",
37253601
"text": "Equally, it is well established that living without access to electricity results in illness and death around the world, caused by everything from not having access to modern healthcare to household air pollution. As of today, 770 million people around the world do not have access to electricity, with over 75% of that population living in Sub-Saharan Africa. The world's poorest 4 billion people consume a mere 5% of the energy used in developed economies, and we need to find ways of delivering reliable electricity to the entire human population in a fashion that is sustainable. Household and ambient air pollution causes 8.7 million deaths each year, largely because of the continued use of fossil fuels. Widespread electrification is a key tool for delivering a just energy transition. Investment in nuclear, has become an urgent necessity. Discarding it, based on risk perceptions divorced from science, would be to abandon the moral obligation to ensure affordable, reliable, and sustainable energy for every community around the world.",
37263602
"metadata": {
37273603
"filetype": "application/pdf",

test_unstructured_ingest/expected-structured-output/s3/recalibrating-risk-report.pdf.json

Lines changed: 1 addition & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -1275,53 +1275,9 @@
12751275
}
12761276
}
12771277
},
1278-
{
1279-
"type": "Image",
1280-
"element_id": "b0197950e1af5c2aac10f5b67d61524a",
1281-
"text": "",
1282-
"metadata": {
1283-
"filetype": "application/pdf",
1284-
"languages": [
1285-
"eng"
1286-
],
1287-
"page_number": 8,
1288-
"data_source": {
1289-
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
1290-
"version": "e690f37ef36368a509d150f373a0bbe0",
1291-
"record_locator": {
1292-
"protocol": "s3",
1293-
"remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
1294-
},
1295-
"date_created": "1676196572.0",
1296-
"date_modified": "1676196572.0"
1297-
}
1298-
}
1299-
},
1300-
{
1301-
"type": "Image",
1302-
"element_id": "34d2dd4af420ea3fdddc8fc5d581cac2",
1303-
"text": "",
1304-
"metadata": {
1305-
"filetype": "application/pdf",
1306-
"languages": [
1307-
"eng"
1308-
],
1309-
"page_number": 8,
1310-
"data_source": {
1311-
"url": "s3://utic-dev-tech-fixtures/small-pdf-set/recalibrating-risk-report.pdf",
1312-
"version": "e690f37ef36368a509d150f373a0bbe0",
1313-
"record_locator": {
1314-
"protocol": "s3",
1315-
"remote_file_path": "s3://utic-dev-tech-fixtures/small-pdf-set/"
1316-
},
1317-
"date_created": "1676196572.0",
1318-
"date_modified": "1676196572.0"
1319-
}
1320-
}
1321-
},
13221278
{
13231279
"type": "FigureCaption",
1324-
"element_id": "a8ac039aa1d77ac96ecd4c8c14a556d5",
1280+
"element_id": "7803862f2804d04dfe8c38c4a353001d",
13251281
"text": "Equally, it is well established that living without access to electricity results in illness and death around the world, caused by everything from not having access to modern healthcare to household air pollution. As of today, 770 million people around the world do not have access to electricity, with over 75% of that population living in Sub-Saharan Africa. The world's poorest 4 billion people consume a mere 5% of the energy used in developed economies, and we need to find ways of delivering reliable electricity to the entire human population in a fashion that is sustainable. Household and ambient air pollution causes 8.7 million deaths each year, largely because of the continued use of fossil fuels. Widespread electrification is a key tool for delivering a just energy transition. Investment in nuclear, has become an urgent necessity. Discarding it, based on risk perceptions divorced from science, would be to abandon the moral obligation to ensure affordable, reliable, and sustainable energy for every community around the world.",
13261282
"metadata": {
13271283
"filetype": "application/pdf",

unstructured/__version__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = "0.15.10-dev2" # pragma: no cover
1+
__version__ = "0.15.10-dev3" # pragma: no cover

unstructured/partition/pdf_image/pdfminer_processing.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
223223
"""
224224

225225
for page in document.pages:
226-
table_boxes = [e.bbox for e in page.elements if e.type == ElementType.TABLE]
226+
non_pdfminer_element_boxes = [e.bbox for e in page.elements if e.source != Source.PDFMINER]
227227
element_boxes = []
228228
element_to_subregion_map = {}
229229
subregion_indice = 0
@@ -234,10 +234,10 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
234234
element_to_subregion_map[i] = subregion_indice
235235
subregion_indice += 1
236236

237-
is_element_subregion_of_tables = (
237+
is_element_subregion_of_other_elements = (
238238
bboxes1_is_almost_subregion_of_bboxes2(
239239
element_boxes,
240-
table_boxes,
240+
non_pdfminer_element_boxes,
241241
env_config.EMBEDDED_TEXT_AGGREGATION_SUBREGION_THRESHOLD,
242242
).sum(axis=1)
243243
== 1
@@ -248,7 +248,7 @@ def clean_pdfminer_inner_elements(document: "DocumentLayout") -> "DocumentLayout
248248
for i, e in enumerate(page.elements)
249249
if (
250250
(i not in element_to_subregion_map)
251-
or not is_element_subregion_of_tables[element_to_subregion_map[i]]
251+
or not is_element_subregion_of_other_elements[element_to_subregion_map[i]]
252252
)
253253
]
254254

0 commit comments

Comments
 (0)