|
814 | 814 | " df = convert_to_dataframe(elements)\n", |
815 | 815 | " df[\"filename\"] = os.path.basename(filename)\n", |
816 | 816 | " if not show_coordinates:\n", |
817 | | - " df.drop(columns=[\"coordinates\"], inplace=True)\n", |
| 817 | + " columns_to_drop = [col for col in [\"coordinates_points\", \"coordinates_system\", \"coordinates_layout_width\", \"coordinates_layout_height\",] if col in df.columns]\n", |
| 818 | + " if columns_to_drop:\n", |
| 819 | + " df.drop(columns=columns_to_drop, inplace=True)\n", |
818 | 820 | " \n", |
819 | 821 | " return df.to_csv(index=False)\n", |
820 | 822 | " \n", |
821 | 823 | " result = convert_to_isd(elements)\n", |
822 | 824 | " for element in result:\n", |
823 | 825 | " element['metadata']['filename'] = os.path.basename(filename)\n", |
824 | 826 | "\n", |
825 | | - " if not show_coordinates:\n", |
826 | | - " del element['coordinates']\n", |
| 827 | + " if not show_coordinates and \"coordinates\" in element[\"metadata\"]:\n", |
| 828 | + " del element[\"metadata\"][\"coordinates\"]\n", |
827 | 829 | " \n", |
828 | 830 | " return result" |
829 | 831 | ] |
|
849 | 851 | "data": { |
850 | 852 | "text/plain": [ |
851 | 853 | "[{'type': 'UncategorizedText',\n", |
852 | | - " 'coordinate_system': None,\n", |
853 | | - " 'layout_width': None,\n", |
854 | | - " 'layout_height': None,\n", |
855 | 854 | " 'element_id': 'db1ca22813f01feda8759ff04a844e56',\n", |
856 | 855 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
857 | 856 | " 'filetype': 'message/rfc822',\n", |
|
861 | 860 | " 'filename': 'family-day.eml'},\n", |
862 | 861 | " 'text': 'Hi All,'},\n", |
863 | 862 | " {'type': 'NarrativeText',\n", |
864 | | - " 'coordinate_system': None,\n", |
865 | | - " 'layout_width': None,\n", |
866 | | - " 'layout_height': None,\n", |
867 | 863 | " 'element_id': 'a663c393a5e143c01ef2bb5c98efa2c1',\n", |
868 | 864 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
869 | 865 | " 'filetype': 'message/rfc822',\n", |
|
873 | 869 | " 'filename': 'family-day.eml'},\n", |
874 | 870 | " 'text': 'Get excited for our first annual family day!\\xa0'},\n", |
875 | 871 | " {'type': 'NarrativeText',\n", |
876 | | - " 'coordinate_system': None,\n", |
877 | | - " 'layout_width': None,\n", |
878 | | - " 'layout_height': None,\n", |
879 | 872 | " 'element_id': 'ce65ca3bef59957d3f1c2bab5725c82f',\n", |
880 | 873 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
881 | 874 | " 'filetype': 'message/rfc822',\n", |
|
885 | 878 | " 'filename': 'family-day.eml'},\n", |
886 | 879 | " 'text': 'There will be face painting, a petting zoo, funnel cake and more.'},\n", |
887 | 880 | " {'type': 'NarrativeText',\n", |
888 | | - " 'coordinate_system': None,\n", |
889 | | - " 'layout_width': None,\n", |
890 | | - " 'layout_height': None,\n", |
891 | 881 | " 'element_id': 'd7bcf988af9f06042d83e25c531e5744',\n", |
892 | 882 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
893 | 883 | " 'filetype': 'message/rfc822',\n", |
|
897 | 887 | " 'filename': 'family-day.eml'},\n", |
898 | 888 | " 'text': 'Make sure to RSVP!'},\n", |
899 | 889 | " {'type': 'Title',\n", |
900 | | - " 'coordinate_system': None,\n", |
901 | | - " 'layout_width': None,\n", |
902 | | - " 'layout_height': None,\n", |
903 | 890 | " 'element_id': '5550577db69c2c8aabcd90979698120a',\n", |
904 | 891 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
905 | 892 | " 'filetype': 'message/rfc822',\n", |
|
909 | 896 | " 'filename': 'family-day.eml'},\n", |
910 | 897 | " 'text': 'Best.'},\n", |
911 | 898 | " {'type': 'Title',\n", |
912 | | - " 'coordinate_system': None,\n", |
913 | | - " 'layout_width': None,\n", |
914 | | - " 'layout_height': None,\n", |
915 | 899 | " 'element_id': 'ca1c571d993b6c1ed8ef56a06c16ba22',\n", |
916 | 900 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
917 | 901 | " 'filetype': 'message/rfc822',\n", |
|
921 | 905 | " 'filename': 'family-day.eml'},\n", |
922 | 906 | " 'text': 'Mallori Harrell'},\n", |
923 | 907 | " {'type': 'Title',\n", |
924 | | - " 'coordinate_system': None,\n", |
925 | | - " 'layout_width': None,\n", |
926 | | - " 'layout_height': None,\n", |
927 | 908 | " 'element_id': 'd5b612de8cd918addd9569b0255b65b2',\n", |
928 | 909 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
929 | 910 | " 'filetype': 'message/rfc822',\n", |
|
933 | 914 | " 'filename': 'family-day.eml'},\n", |
934 | 915 | " 'text': 'Unstructured Technologies'},\n", |
935 | 916 | " {'type': 'Title',\n", |
936 | | - " 'coordinate_system': None,\n", |
937 | | - " 'layout_width': None,\n", |
938 | | - " 'layout_height': None,\n", |
939 | 917 | " 'element_id': '2e0b9e8ee04b9594a9c26d8535b818ff',\n", |
940 | 918 | " 'metadata': {'date': '2022-12-21T10:28:53-06:00',\n", |
941 | 919 | " 'filetype': 'message/rfc822',\n", |
|
977 | 955 | { |
978 | 956 | "data": { |
979 | 957 | "text/plain": [ |
980 | | - "'type,text,element_id,coordinate_system,layout_width,layout_height,filename,page_number,url,sent_from,sent_to,subject,sender\\nUncategorizedText,\"Hi All,\",db1ca22813f01feda8759ff04a844e56,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nNarrativeText,Get excited for our first annual family day!\\xa0,a663c393a5e143c01ef2bb5c98efa2c1,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nNarrativeText,\"There will be face painting, a petting zoo, funnel cake and more.\",ce65ca3bef59957d3f1c2bab5725c82f,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nNarrativeText,Make sure to RSVP!,d7bcf988af9f06042d83e25c531e5744,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nTitle,Best.,5550577db69c2c8aabcd90979698120a,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nTitle,Mallori Harrell,ca1c571d993b6c1ed8ef56a06c16ba22,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nTitle,Unstructured Technologies,d5b612de8cd918addd9569b0255b65b2,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\nTitle,Data Scientist,2e0b9e8ee04b9594a9c26d8535b818ff,,,,family-day.eml,,,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>\\n'" |
| 958 | + "'type,text,element_id,date,filetype,sent_from,sent_to,subject,sender,filename\\nUncategorizedText,\"Hi All,\",db1ca22813f01feda8759ff04a844e56,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nNarrativeText,Get excited for our first annual family day!\\xa0,a663c393a5e143c01ef2bb5c98efa2c1,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nNarrativeText,\"There will be face painting, a petting zoo, funnel cake and more.\",ce65ca3bef59957d3f1c2bab5725c82f,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nNarrativeText,Make sure to RSVP!,d7bcf988af9f06042d83e25c531e5744,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nTitle,Best.,5550577db69c2c8aabcd90979698120a,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nTitle,Mallori Harrell,ca1c571d993b6c1ed8ef56a06c16ba22,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nTitle,Unstructured Technologies,d5b612de8cd918addd9569b0255b65b2,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\nTitle,Data Scientist,2e0b9e8ee04b9594a9c26d8535b818ff,2022-12-21T10:28:53-06:00,message/rfc822,[\\'Mallori Harrell < [email protected]>\\'],[\\'Mallori Harrell < [email protected]>\\'],Family Day,Mallori Harrell < [email protected]>,family-day.eml\\n'" |
981 | 959 | ] |
982 | 960 | }, |
983 | 961 | "execution_count": null, |
|
1029 | 1007 | "data": { |
1030 | 1008 | "text/plain": [ |
1031 | 1009 | "[{'type': 'NarrativeText',\n", |
1032 | | - " 'coordinate_system': None,\n", |
1033 | | - " 'layout_width': None,\n", |
1034 | | - " 'layout_height': None,\n", |
1035 | 1010 | " 'element_id': '1df8eeb8be847c3a1a7411e3be3e0396',\n", |
1036 | 1011 | " 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n", |
1037 | 1012 | " 'text': 'This is a test document to use for unit tests.'},\n", |
1038 | 1013 | " {'type': 'Title',\n", |
1039 | | - " 'coordinate_system': None,\n", |
1040 | | - " 'layout_width': None,\n", |
1041 | | - " 'layout_height': None,\n", |
1042 | 1014 | " 'element_id': '9c218520320f238595f1fde74bdd137d',\n", |
1043 | 1015 | " 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n", |
1044 | 1016 | " 'text': 'Important points:'},\n", |
1045 | 1017 | " {'type': 'ListItem',\n", |
1046 | | - " 'coordinate_system': None,\n", |
1047 | | - " 'layout_width': None,\n", |
1048 | | - " 'layout_height': None,\n", |
1049 | 1018 | " 'element_id': '39a3ae572581d0f1fe7511fd7b3aa414',\n", |
1050 | 1019 | " 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n", |
1051 | 1020 | " 'text': 'Hamburgers are delicious'},\n", |
1052 | 1021 | " {'type': 'ListItem',\n", |
1053 | | - " 'coordinate_system': None,\n", |
1054 | | - " 'layout_width': None,\n", |
1055 | | - " 'layout_height': None,\n", |
1056 | 1022 | " 'element_id': 'fc1adcb8eaceac694e500a103f9f698f',\n", |
1057 | 1023 | " 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n", |
1058 | 1024 | " 'text': 'Dogs are the best'},\n", |
1059 | 1025 | " {'type': 'ListItem',\n", |
1060 | | - " 'coordinate_system': None,\n", |
1061 | | - " 'layout_width': None,\n", |
1062 | | - " 'layout_height': None,\n", |
1063 | 1026 | " 'element_id': '0b61e826b1c4ab05750184da72b89f83',\n", |
1064 | 1027 | " 'metadata': {'filetype': 'text/plain', 'filename': 'fake-text.txt'},\n", |
1065 | 1028 | " 'text': 'I love fuzzy blankets'}]" |
|
0 commit comments