|
45 | 45 | "cell_type": "code", |
46 | 46 | "metadata": { |
47 | 47 | "ExecuteTime": { |
48 | | - "end_time": "2025-01-13T08:02:40.332853Z", |
49 | | - "start_time": "2025-01-13T08:02:40.330328Z" |
| 48 | + "end_time": "2025-01-14T10:12:55.833836Z", |
| 49 | + "start_time": "2025-01-14T10:12:55.831595Z" |
50 | 50 | } |
51 | 51 | }, |
52 | 52 | "source": [ |
|
69 | 69 | "cell_type": "code", |
70 | 70 | "metadata": { |
71 | 71 | "ExecuteTime": { |
72 | | - "end_time": "2025-01-13T08:02:43.828958Z", |
73 | | - "start_time": "2025-01-13T08:02:40.618406Z" |
| 72 | + "end_time": "2025-01-14T10:12:57.666515Z", |
| 73 | + "start_time": "2025-01-14T10:12:55.886864Z" |
74 | 74 | } |
75 | 75 | }, |
76 | 76 | "source": "%pip install -qU langchain_community pymupdf", |
|
98 | 98 | "cell_type": "code", |
99 | 99 | "metadata": { |
100 | 100 | "ExecuteTime": { |
101 | | - "end_time": "2025-01-13T08:02:44.972549Z", |
102 | | - "start_time": "2025-01-13T08:02:43.854127Z" |
| 101 | + "end_time": "2025-01-14T10:12:58.499Z", |
| 102 | + "start_time": "2025-01-14T10:12:57.812164Z" |
103 | 103 | } |
104 | 104 | }, |
105 | 105 | "source": [ |
|
122 | 122 | "cell_type": "code", |
123 | 123 | "metadata": { |
124 | 124 | "ExecuteTime": { |
125 | | - "end_time": "2025-01-13T08:02:45.375168Z", |
126 | | - "start_time": "2025-01-13T08:02:45.017937Z" |
| 125 | + "end_time": "2025-01-14T10:13:00.572764Z", |
| 126 | + "start_time": "2025-01-14T10:13:00.231470Z" |
127 | 127 | } |
128 | 128 | }, |
129 | 129 | "source": [ |
|
148 | 148 | "cell_type": "code", |
149 | 149 | "metadata": { |
150 | 150 | "ExecuteTime": { |
151 | | - "end_time": "2025-01-13T08:02:45.451265Z", |
152 | | - "start_time": "2025-01-13T08:02:45.448398Z" |
| 151 | + "end_time": "2025-01-14T10:13:01.816297Z", |
| 152 | + "start_time": "2025-01-14T10:13:01.813223Z" |
153 | 153 | } |
154 | 154 | }, |
155 | 155 | "source": [ |
|
192 | 192 | "cell_type": "code", |
193 | 193 | "metadata": { |
194 | 194 | "ExecuteTime": { |
195 | | - "end_time": "2025-01-13T08:02:45.690351Z", |
196 | | - "start_time": "2025-01-13T08:02:45.593918Z" |
| 195 | + "end_time": "2025-01-14T10:13:04.054290Z", |
| 196 | + "start_time": "2025-01-14T10:13:03.994306Z" |
197 | 197 | } |
198 | 198 | }, |
199 | 199 | "source": [ |
|
225 | 225 | "cell_type": "code", |
226 | 226 | "metadata": { |
227 | 227 | "ExecuteTime": { |
228 | | - "end_time": "2025-01-13T08:02:45.789304Z", |
229 | | - "start_time": "2025-01-13T08:02:45.786254Z" |
| 228 | + "end_time": "2025-01-14T10:13:05.083279Z", |
| 229 | + "start_time": "2025-01-14T10:13:05.080418Z" |
230 | 230 | } |
231 | 231 | }, |
232 | 232 | "source": [ |
|
305 | 305 | "cell_type": "code", |
306 | 306 | "metadata": { |
307 | 307 | "ExecuteTime": { |
308 | | - "end_time": "2025-01-13T08:02:51.025269Z", |
309 | | - "start_time": "2025-01-13T08:02:50.926027Z" |
| 308 | + "end_time": "2025-01-14T10:13:10.327254Z", |
| 309 | + "start_time": "2025-01-14T10:13:10.278837Z" |
310 | 310 | } |
311 | 311 | }, |
312 | 312 | "source": [ |
|
361 | 361 | "cell_type": "code", |
362 | 362 | "metadata": { |
363 | 363 | "ExecuteTime": { |
364 | | - "end_time": "2025-01-13T08:02:56.217346Z", |
365 | | - "start_time": "2025-01-13T08:02:56.110305Z" |
| 364 | + "end_time": "2025-01-14T10:13:13.787181Z", |
| 365 | + "start_time": "2025-01-14T10:13:13.733837Z" |
366 | 366 | } |
367 | 367 | }, |
368 | 368 | "source": [ |
|
416 | 416 | "cell_type": "code", |
417 | 417 | "metadata": { |
418 | 418 | "ExecuteTime": { |
419 | | - "end_time": "2025-01-13T08:03:06.848265Z", |
420 | | - "start_time": "2025-01-13T08:03:06.796198Z" |
| 419 | + "end_time": "2025-01-14T10:13:17.262238Z", |
| 420 | + "start_time": "2025-01-14T10:13:17.211110Z" |
421 | 421 | } |
422 | 422 | }, |
423 | 423 | "source": [ |
|
573 | 573 | "cell_type": "code", |
574 | 574 | "metadata": { |
575 | 575 | "ExecuteTime": { |
576 | | - "end_time": "2025-01-13T08:03:12.420680Z", |
577 | | - "start_time": "2025-01-13T08:03:10.602900Z" |
| 576 | + "end_time": "2025-01-14T10:13:23.527167Z", |
| 577 | + "start_time": "2025-01-14T10:13:22.135019Z" |
578 | 578 | } |
579 | 579 | }, |
580 | 580 | "source": [ |
|
595 | 595 | "cell_type": "code", |
596 | 596 | "metadata": { |
597 | 597 | "ExecuteTime": { |
598 | | - "end_time": "2025-01-13T08:03:37.046948Z", |
599 | | - "start_time": "2025-01-13T08:03:12.566199Z" |
| 598 | + "end_time": "2025-01-14T10:13:57.393470Z", |
| 599 | + "start_time": "2025-01-14T10:13:30.062053Z" |
600 | 600 | } |
601 | 601 | }, |
602 | 602 | "source": [ |
|
606 | 606 | " \"./example_data/layout-parser-paper.pdf\",\n", |
607 | 607 | " mode=\"page\",\n", |
608 | 608 | " extract_images=True,\n", |
609 | | - " images_parser=RapidOCRBlobParser(format=\"html\"),\n", |
| 609 | + " images_parser=RapidOCRBlobParser(format=\"html-img\"),\n", |
610 | 610 | ")\n", |
611 | 611 | "docs = loader.load()\n", |
612 | 612 | "\n", |
|
708 | 708 | "cell_type": "code", |
709 | 709 | "metadata": { |
710 | 710 | "ExecuteTime": { |
711 | | - "end_time": "2025-01-13T08:03:41.280752Z", |
712 | | - "start_time": "2025-01-13T08:03:39.816022Z" |
| 711 | + "end_time": "2025-01-14T10:16:41.309602Z", |
| 712 | + "start_time": "2025-01-14T10:16:39.905135Z" |
713 | 713 | } |
714 | 714 | }, |
715 | 715 | "source": [ |
|
730 | 730 | "cell_type": "code", |
731 | 731 | "metadata": { |
732 | 732 | "ExecuteTime": { |
733 | | - "end_time": "2025-01-13T08:03:49.305347Z", |
734 | | - "start_time": "2025-01-13T08:03:41.439083Z" |
| 733 | + "end_time": "2025-01-14T10:16:51.234312Z", |
| 734 | + "start_time": "2025-01-14T10:16:42.503449Z" |
735 | 735 | } |
736 | 736 | }, |
737 | 737 | "source": [ |
|
835 | 835 | "cell_type": "code", |
836 | 836 | "metadata": { |
837 | 837 | "ExecuteTime": { |
838 | | - "end_time": "2025-01-13T08:03:53.298840Z", |
839 | | - "start_time": "2025-01-13T08:03:51.394025Z" |
| 838 | + "end_time": "2025-01-14T10:16:54.777595Z", |
| 839 | + "start_time": "2025-01-14T10:16:53.206882Z" |
840 | 840 | } |
841 | 841 | }, |
842 | 842 | "source": [ |
|
857 | 857 | "cell_type": "code", |
858 | 858 | "metadata": { |
859 | 859 | "ExecuteTime": { |
860 | | - "end_time": "2025-01-13T08:03:53.371158Z", |
861 | | - "start_time": "2025-01-13T08:03:53.341852Z" |
| 860 | + "end_time": "2025-01-14T10:16:54.884726Z", |
| 861 | + "start_time": "2025-01-14T10:16:54.855413Z" |
862 | 862 | } |
863 | 863 | }, |
864 | 864 | "source": [ |
|
886 | 886 | "cell_type": "code", |
887 | 887 | "metadata": { |
888 | 888 | "ExecuteTime": { |
889 | | - "end_time": "2025-01-13T08:03:53.395952Z", |
890 | | - "start_time": "2025-01-13T08:03:53.393425Z" |
| 889 | + "end_time": "2025-01-14T10:16:55.992301Z", |
| 890 | + "start_time": "2025-01-14T10:16:55.988357Z" |
891 | 891 | } |
892 | 892 | }, |
893 | 893 | "source": [ |
|
903 | 903 | "cell_type": "code", |
904 | 904 | "metadata": { |
905 | 905 | "ExecuteTime": { |
906 | | - "end_time": "2025-01-13T08:04:44.787666Z", |
907 | | - "start_time": "2025-01-13T08:03:54.545139Z" |
| 906 | + "end_time": "2025-01-14T10:18:04.910544Z", |
| 907 | + "start_time": "2025-01-14T10:16:58.651103Z" |
908 | 908 | } |
909 | 909 | }, |
910 | 910 | "source": [ |
|
916 | 916 | " mode=\"page\",\n", |
917 | 917 | " extract_images=True,\n", |
918 | 918 | " images_parser=LLMImageBlobParser(\n", |
919 | | - " model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024), format=\"markdown\"\n", |
| 919 | + " model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024), format=\"markdown-link\"\n", |
920 | 920 | " ),\n", |
921 | 921 | ")\n", |
922 | 922 | "docs = loader.load()\n", |
|
959 | 959 | "\n", |
960 | 960 | "\n", |
961 | 961 | "\n", |
962 | | - "![Summary: Diagram illustrating the components of a layout system using coordinates, text blocks, and layout elements. It shows coordinate representations (intervals, rectangles, quadrilaterals) and text block features. The layout is a list of these elements. Includes transformation and operation APIs.\n", |
| 962 | + "![Diagram depicting the structure of layout elements in visualization processing. Upper section illustrates coordinate systems: x-interval, y-interval, rectangle, quadrilateral. Middle section introduces textblock: combining coordinates with extra features (Block Text, Block Type, Reading Order). Bottom section defines layout as a list of layout elements [coordinate1, textblock1, ..., textblock2, layout1\\\\]. Right side notes: \"The same transformation and operation APIs.\" Text: \n", |
963 | 963 | "\n", |
964 | | - "Extracted Text:\n", |
965 | | - "- Coordinate\n", |
966 | | - "- Coordinate\n", |
967 | | - "- x-interval\n", |
968 | | - "- y-interval\n", |
969 | | - "- start\n", |
970 | | - "- end\n", |
971 | | - "- Rectangle\n", |
972 | | - "- Quadrilateral\n", |
973 | | - "- textblock\n", |
974 | | - "- Coordinate\n", |
975 | | - "- Extra features\n", |
976 | | - "- Block Text\n", |
977 | | - "- Block Type\n", |
978 | | - "- Reading Order\n", |
979 | | - "- layout\n", |
980 | | - "- A list of the layout elements\n", |
981 | | - "- The same transformation and operation APIs](#)\n" |
| 964 | + "Coordinate\n", |
| 965 | + "textblock\n", |
| 966 | + "Coordinate + Extra features\n", |
| 967 | + "Block Text\n", |
| 968 | + "Block Type\n", |
| 969 | + "Reading Order\n", |
| 970 | + "...\n", |
| 971 | + "layout\n", |
| 972 | + "A list of the layout elements\n", |
| 973 | + "The same transformation and operation APIs](#)\n" |
982 | 974 | ] |
983 | 975 | } |
984 | 976 | ], |
|
1002 | 994 | "cell_type": "code", |
1003 | 995 | "metadata": { |
1004 | 996 | "ExecuteTime": { |
1005 | | - "end_time": "2025-01-13T08:04:52.328048Z", |
1006 | | - "start_time": "2025-01-13T08:04:51.152693Z" |
| 997 | + "end_time": "2025-01-14T10:18:16.140968Z", |
| 998 | + "start_time": "2025-01-14T10:18:14.888610Z" |
1007 | 999 | } |
1008 | 1000 | }, |
1009 | 1001 | "source": [ |
|
1112 | 1104 | "cell_type": "code", |
1113 | 1105 | "metadata": { |
1114 | 1106 | "ExecuteTime": { |
1115 | | - "end_time": "2025-01-13T08:04:55.110532Z", |
1116 | | - "start_time": "2025-01-13T08:04:55.059126Z" |
| 1107 | + "end_time": "2025-01-14T10:18:17.672927Z", |
| 1108 | + "start_time": "2025-01-14T10:18:17.618741Z" |
1117 | 1109 | } |
1118 | 1110 | }, |
1119 | 1111 | "source": [ |
|
0 commit comments