Skip to content

Commit 74680fc

Browse files
authored
Omit coordinates unless requested (#149)
1 parent ef40417 commit 74680fc

File tree

8 files changed

+117
-196
lines changed

8 files changed

+117
-196
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1-
## 0.0.31-dev0
1+
## 0.0.31
22

33
* Add retry parameters on fanout requests
4+
* Bump unstructured library to 0.8.1
5+
* Fix how an element's coordinate information is removed
46

57
## 0.0.30
68

pipeline-notebooks/pipeline-general.ipynb

Lines changed: 6 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -814,16 +814,18 @@
814814
" df = convert_to_dataframe(elements)\n",
815815
" df[\"filename\"] = os.path.basename(filename)\n",
816816
" if not show_coordinates:\n",
817-
" df.drop(columns=[\"coordinates\"], inplace=True)\n",
817+
" columns_to_drop = [col for col in [\"coordinates_points\", \"coordinates_system\", \"coordinates_layout_width\", \"coordinates_layout_height\",] if col in df.columns]\n",
818+
" if columns_to_drop:\n",
819+
" df.drop(columns=columns_to_drop, inplace=True)\n",
818820
" \n",
819821
" return df.to_csv(index=False)\n",
820822
" \n",
821823
" result = convert_to_isd(elements)\n",
822824
" for element in result:\n",
823825
" element['metadata']['filename'] = os.path.basename(filename)\n",
824826
"\n",
825-
" if not show_coordinates:\n",
826-
" del element['coordinates']\n",
827+
" if not show_coordinates and \"coordinates\" in element[\"metadata\"]:\n",
828+
" del element[\"metadata\"][\"coordinates\"]\n",
827829
" \n",
828830
" return result"
829831
]
@@ -849,9 +851,6 @@
849851
"data": {
850852
"text/plain": [
851853
"[{'type': 'UncategorizedText',\n",
852-
" 'coordinate_system': None,\n",
853-
" 'layout_width': None,\n",
854-
" 'layout_height': None,\n",
855854
" 'element_id': 'db1ca22813f01feda8759ff04a844e56',\n",
856855
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
857856
" 'filetype': 'message/rfc822',\n",
@@ -861,9 +860,6 @@
861860
" 'filename': 'family-day.eml'},\n",
862861
" 'text': 'Hi All,'},\n",
863862
" {'type': 'NarrativeText',\n",
864-
" 'coordinate_system': None,\n",
865-
" 'layout_width': None,\n",
866-
" 'layout_height': None,\n",
867863
" 'element_id': 'a663c393a5e143c01ef2bb5c98efa2c1',\n",
868864
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
869865
" 'filetype': 'message/rfc822',\n",
@@ -873,9 +869,6 @@
873869
" 'filename': 'family-day.eml'},\n",
874870
" 'text': 'Get excited for our first annual family day!\\xa0'},\n",
875871
" {'type': 'NarrativeText',\n",
876-
" 'coordinate_system': None,\n",
877-
" 'layout_width': None,\n",
878-
" 'layout_height': None,\n",
879872
" 'element_id': 'ce65ca3bef59957d3f1c2bab5725c82f',\n",
880873
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
881874
" 'filetype': 'message/rfc822',\n",
@@ -885,9 +878,6 @@
885878
" 'filename': 'family-day.eml'},\n",
886879
" 'text': 'There will be face painting, a petting zoo, funnel cake and more.'},\n",
887880
" {'type': 'NarrativeText',\n",
888-
" 'coordinate_system': None,\n",
889-
" 'layout_width': None,\n",
890-
" 'layout_height': None,\n",
891881
" 'element_id': 'd7bcf988af9f06042d83e25c531e5744',\n",
892882
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
893883
" 'filetype': 'message/rfc822',\n",
@@ -897,9 +887,6 @@
897887
" 'filename': 'family-day.eml'},\n",
898888
" 'text': 'Make sure to RSVP!'},\n",
899889
" {'type': 'Title',\n",
900-
" 'coordinate_system': None,\n",
901-
" 'layout_width': None,\n",
902-
" 'layout_height': None,\n",
903890
" 'element_id': '5550577db69c2c8aabcd90979698120a',\n",
904891
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
905892
" 'filetype': 'message/rfc822',\n",
@@ -909,9 +896,6 @@
909896
" 'filename': 'family-day.eml'},\n",
910897
" 'text': 'Best.'},\n",
911898
" {'type': 'Title',\n",
912-
" 'coordinate_system': None,\n",
913-
" 'layout_width': None,\n",
914-
" 'layout_height': None,\n",
915899
" 'element_id': 'ca1c571d993b6c1ed8ef56a06c16ba22',\n",
916900
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
917901
" 'filetype': 'message/rfc822',\n",
@@ -921,9 +905,6 @@
921905
" 'filename': 'family-day.eml'},\n",
922906
" 'text': 'Mallori Harrell'},\n",
923907
" {'type': 'Title',\n",
924-
" 'coordinate_system': None,\n",
925-
" 'layout_width': None,\n",
926-
" 'layout_height': None,\n",
927908
" 'element_id': 'd5b612de8cd918addd9569b0255b65b2',\n",
928909
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
929910
" 'filetype': 'message/rfc822',\n",
@@ -933,9 +914,6 @@
933914
" 'filename': 'family-day.eml'},\n",
934915
" 'text': 'Unstructured Technologies'},\n",
935916
" {'type': 'Title',\n",
936-
" 'coordinate_system': None,\n",
937-
" 'layout_width': None,\n",
938-
" 'layout_height': None,\n",
939917
" 'element_id': '2e0b9e8ee04b9594a9c26d8535b818ff',\n",
940918
" 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n",
941919
" 'filetype': 'message/rfc822',\n",
@@ -977,7 +955,7 @@
977955
{
978956
"data": {
979957
"text/plain": [
980-
"'type,text,element_id,coordinate_system,layout_width,layout_height,filename,page_number,url,sent_from,sent_to,subject,sender\\nUncategorizedText,\"Hi All,\",db1ca22813f01feda8759ff04a844e56,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nNarrativeText,Get excited for our first annual family day!\\xa0,a663c393a5e143c01ef2bb5c98efa2c1,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nNarrativeText,\"There will be face painting, a petting zoo, funnel cake and more.\",ce65ca3bef59957d3f1c2bab5725c82f,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nNarrativeText,Make sure to RSVP!,d7bcf988af9f06042d83e25c531e5744,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nTitle,Best.,5550577db69c2c8aabcd90979698120a,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nTitle,Mallori Harrell,ca1c571d993b6c1ed8ef56a06c16ba22,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nTitle,Unstructured Technologies,d5b612de8cd918addd9569b0255b65b2,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\nTitle,Data Scientist,2e0b9e8ee04b9594a9c26d8535b818ff,,,,family-day.eml,,,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>\\n'"
958+
"'type,text,element_id,date,filetype,sent_from,sent_to,subject,sender,filename\\nUncategorizedText,\"Hi All,\",db1ca22813f01feda8759ff04a844e56,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nNarrativeText,Get excited for our first annual family day!\\xa0,a663c393a5e143c01ef2bb5c98efa2c1,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nNarrativeText,\"There will be face painting, a petting zoo, funnel cake and more.\",ce65ca3bef59957d3f1c2bab5725c82f,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nNarrativeText,Make sure to RSVP!,d7bcf988af9f06042d83e25c531e5744,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nTitle,Best.,5550577db69c2c8aabcd90979698120a,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nTitle,Mallori Harrell,ca1c571d993b6c1ed8ef56a06c16ba22,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nTitle,Unstructured Technologies,d5b612de8cd918addd9569b0255b65b2,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\nTitle,Data 
Scientist,2e0b9e8ee04b9594a9c26d8535b818ff,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell <[email protected]>\\'],[\\'Mallori Harrell <[email protected]>\\'],Family Day,Mallori Harrell <[email protected]>,family-day.eml\\n'"
981959
]
982960
},
983961
"execution_count": null,
@@ -1029,37 +1007,22 @@
10291007
"data": {
10301008
"text/plain": [
10311009
"[{'type': 'NarrativeText',\n",
1032-
" 'coordinate_system': None,\n",
1033-
" 'layout_width': None,\n",
1034-
" 'layout_height': None,\n",
10351010
" 'element_id': '1df8eeb8be847c3a1a7411e3be3e0396',\n",
10361011
" 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n",
10371012
" 'text': 'This is a test document to use for unit tests.'},\n",
10381013
" {'type': 'Title',\n",
1039-
" 'coordinate_system': None,\n",
1040-
" 'layout_width': None,\n",
1041-
" 'layout_height': None,\n",
10421014
" 'element_id': '9c218520320f238595f1fde74bdd137d',\n",
10431015
" 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n",
10441016
" 'text': 'Important points:'},\n",
10451017
" {'type': 'ListItem',\n",
1046-
" 'coordinate_system': None,\n",
1047-
" 'layout_width': None,\n",
1048-
" 'layout_height': None,\n",
10491018
" 'element_id': '39a3ae572581d0f1fe7511fd7b3aa414',\n",
10501019
" 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n",
10511020
" 'text': 'Hamburgers are delicious'},\n",
10521021
" {'type': 'ListItem',\n",
1053-
" 'coordinate_system': None,\n",
1054-
" 'layout_width': None,\n",
1055-
" 'layout_height': None,\n",
10561022
" 'element_id': 'fc1adcb8eaceac694e500a103f9f698f',\n",
10571023
" 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n",
10581024
" 'text': 'Dogs are the best'},\n",
10591025
" {'type': 'ListItem',\n",
1060-
" 'coordinate_system': None,\n",
1061-
" 'layout_width': None,\n",
1062-
" 'layout_height': None,\n",
10631026
" 'element_id': '0b61e826b1c4ab05750184da72b89f83',\n",
10641027
" 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n",
10651028
" 'text': 'I love fuzzy blankets'}]"

prepline_general/api/general.py

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -274,16 +274,27 @@ def pipeline_api(
274274
df = convert_to_dataframe(elements)
275275
df["filename"] = os.path.basename(filename)
276276
if not show_coordinates:
277-
df.drop(columns=["coordinates"], inplace=True)
277+
columns_to_drop = [
278+
col
279+
for col in [
280+
"coordinates_points",
281+
"coordinates_system",
282+
"coordinates_layout_width",
283+
"coordinates_layout_height",
284+
]
285+
if col in df.columns
286+
]
287+
if columns_to_drop:
288+
df.drop(columns=columns_to_drop, inplace=True)
278289

279290
return df.to_csv(index=False)
280291

281292
result = convert_to_isd(elements)
282293
for element in result:
283294
element["metadata"]["filename"] = os.path.basename(filename)
284295

285-
if not show_coordinates:
286-
del element["coordinates"]
296+
if not show_coordinates and "coordinates" in element["metadata"]:
297+
del element["metadata"]["coordinates"]
287298

288299
return result
289300

requirements/base.in

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,10 @@
1-
unstructured[local-inference]>=0.7.1
1+
unstructured[local-inference]>=0.8.1
22
unstructured-api-tools>=0.6.0
3+
pydantic<2.0.2
4+
# Pinning click due to a Unicode issue in black;
5+
# the pin can be removed after black drops support for Python 3.6
6+
# ref: https://github.com/psf/black/issues/2964
7+
click==8.1.3
38
ratelimit
49
requests
510
pypdf

0 commit comments

Comments
 (0)