Skip to content

Commit 4762fab

Browse files
committed
Change the format for images parser
1 parent d7d3021 commit 4762fab

File tree

3 files changed

+102
-74
lines changed
  • docs/docs/integrations/document_loaders
  • libs/community
    • langchain_community/document_loaders/parsers
    • tests/integration_tests/document_loaders/parsers

3 files changed

+102
-74
lines changed

docs/docs/integrations/document_loaders/pymupdf.ipynb

Lines changed: 53 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -45,8 +45,8 @@
4545
"cell_type": "code",
4646
"metadata": {
4747
"ExecuteTime": {
48-
"end_time": "2025-01-13T08:02:40.332853Z",
49-
"start_time": "2025-01-13T08:02:40.330328Z"
48+
"end_time": "2025-01-14T10:12:55.833836Z",
49+
"start_time": "2025-01-14T10:12:55.831595Z"
5050
}
5151
},
5252
"source": [
@@ -69,8 +69,8 @@
6969
"cell_type": "code",
7070
"metadata": {
7171
"ExecuteTime": {
72-
"end_time": "2025-01-13T08:02:43.828958Z",
73-
"start_time": "2025-01-13T08:02:40.618406Z"
72+
"end_time": "2025-01-14T10:12:57.666515Z",
73+
"start_time": "2025-01-14T10:12:55.886864Z"
7474
}
7575
},
7676
"source": "%pip install -qU langchain_community pymupdf",
@@ -98,8 +98,8 @@
9898
"cell_type": "code",
9999
"metadata": {
100100
"ExecuteTime": {
101-
"end_time": "2025-01-13T08:02:44.972549Z",
102-
"start_time": "2025-01-13T08:02:43.854127Z"
101+
"end_time": "2025-01-14T10:12:58.499Z",
102+
"start_time": "2025-01-14T10:12:57.812164Z"
103103
}
104104
},
105105
"source": [
@@ -122,8 +122,8 @@
122122
"cell_type": "code",
123123
"metadata": {
124124
"ExecuteTime": {
125-
"end_time": "2025-01-13T08:02:45.375168Z",
126-
"start_time": "2025-01-13T08:02:45.017937Z"
125+
"end_time": "2025-01-14T10:13:00.572764Z",
126+
"start_time": "2025-01-14T10:13:00.231470Z"
127127
}
128128
},
129129
"source": [
@@ -148,8 +148,8 @@
148148
"cell_type": "code",
149149
"metadata": {
150150
"ExecuteTime": {
151-
"end_time": "2025-01-13T08:02:45.451265Z",
152-
"start_time": "2025-01-13T08:02:45.448398Z"
151+
"end_time": "2025-01-14T10:13:01.816297Z",
152+
"start_time": "2025-01-14T10:13:01.813223Z"
153153
}
154154
},
155155
"source": [
@@ -192,8 +192,8 @@
192192
"cell_type": "code",
193193
"metadata": {
194194
"ExecuteTime": {
195-
"end_time": "2025-01-13T08:02:45.690351Z",
196-
"start_time": "2025-01-13T08:02:45.593918Z"
195+
"end_time": "2025-01-14T10:13:04.054290Z",
196+
"start_time": "2025-01-14T10:13:03.994306Z"
197197
}
198198
},
199199
"source": [
@@ -225,8 +225,8 @@
225225
"cell_type": "code",
226226
"metadata": {
227227
"ExecuteTime": {
228-
"end_time": "2025-01-13T08:02:45.789304Z",
229-
"start_time": "2025-01-13T08:02:45.786254Z"
228+
"end_time": "2025-01-14T10:13:05.083279Z",
229+
"start_time": "2025-01-14T10:13:05.080418Z"
230230
}
231231
},
232232
"source": [
@@ -305,8 +305,8 @@
305305
"cell_type": "code",
306306
"metadata": {
307307
"ExecuteTime": {
308-
"end_time": "2025-01-13T08:02:51.025269Z",
309-
"start_time": "2025-01-13T08:02:50.926027Z"
308+
"end_time": "2025-01-14T10:13:10.327254Z",
309+
"start_time": "2025-01-14T10:13:10.278837Z"
310310
}
311311
},
312312
"source": [
@@ -361,8 +361,8 @@
361361
"cell_type": "code",
362362
"metadata": {
363363
"ExecuteTime": {
364-
"end_time": "2025-01-13T08:02:56.217346Z",
365-
"start_time": "2025-01-13T08:02:56.110305Z"
364+
"end_time": "2025-01-14T10:13:13.787181Z",
365+
"start_time": "2025-01-14T10:13:13.733837Z"
366366
}
367367
},
368368
"source": [
@@ -416,8 +416,8 @@
416416
"cell_type": "code",
417417
"metadata": {
418418
"ExecuteTime": {
419-
"end_time": "2025-01-13T08:03:06.848265Z",
420-
"start_time": "2025-01-13T08:03:06.796198Z"
419+
"end_time": "2025-01-14T10:13:17.262238Z",
420+
"start_time": "2025-01-14T10:13:17.211110Z"
421421
}
422422
},
423423
"source": [
@@ -573,8 +573,8 @@
573573
"cell_type": "code",
574574
"metadata": {
575575
"ExecuteTime": {
576-
"end_time": "2025-01-13T08:03:12.420680Z",
577-
"start_time": "2025-01-13T08:03:10.602900Z"
576+
"end_time": "2025-01-14T10:13:23.527167Z",
577+
"start_time": "2025-01-14T10:13:22.135019Z"
578578
}
579579
},
580580
"source": [
@@ -595,8 +595,8 @@
595595
"cell_type": "code",
596596
"metadata": {
597597
"ExecuteTime": {
598-
"end_time": "2025-01-13T08:03:37.046948Z",
599-
"start_time": "2025-01-13T08:03:12.566199Z"
598+
"end_time": "2025-01-14T10:13:57.393470Z",
599+
"start_time": "2025-01-14T10:13:30.062053Z"
600600
}
601601
},
602602
"source": [
@@ -606,7 +606,7 @@
606606
" \"./example_data/layout-parser-paper.pdf\",\n",
607607
" mode=\"page\",\n",
608608
" extract_images=True,\n",
609-
" images_parser=RapidOCRBlobParser(format=\"html\"),\n",
609+
" images_parser=RapidOCRBlobParser(format=\"html-img\"),\n",
610610
")\n",
611611
"docs = loader.load()\n",
612612
"\n",
@@ -708,8 +708,8 @@
708708
"cell_type": "code",
709709
"metadata": {
710710
"ExecuteTime": {
711-
"end_time": "2025-01-13T08:03:41.280752Z",
712-
"start_time": "2025-01-13T08:03:39.816022Z"
711+
"end_time": "2025-01-14T10:16:41.309602Z",
712+
"start_time": "2025-01-14T10:16:39.905135Z"
713713
}
714714
},
715715
"source": [
@@ -730,8 +730,8 @@
730730
"cell_type": "code",
731731
"metadata": {
732732
"ExecuteTime": {
733-
"end_time": "2025-01-13T08:03:49.305347Z",
734-
"start_time": "2025-01-13T08:03:41.439083Z"
733+
"end_time": "2025-01-14T10:16:51.234312Z",
734+
"start_time": "2025-01-14T10:16:42.503449Z"
735735
}
736736
},
737737
"source": [
@@ -835,8 +835,8 @@
835835
"cell_type": "code",
836836
"metadata": {
837837
"ExecuteTime": {
838-
"end_time": "2025-01-13T08:03:53.298840Z",
839-
"start_time": "2025-01-13T08:03:51.394025Z"
838+
"end_time": "2025-01-14T10:16:54.777595Z",
839+
"start_time": "2025-01-14T10:16:53.206882Z"
840840
}
841841
},
842842
"source": [
@@ -857,8 +857,8 @@
857857
"cell_type": "code",
858858
"metadata": {
859859
"ExecuteTime": {
860-
"end_time": "2025-01-13T08:03:53.371158Z",
861-
"start_time": "2025-01-13T08:03:53.341852Z"
860+
"end_time": "2025-01-14T10:16:54.884726Z",
861+
"start_time": "2025-01-14T10:16:54.855413Z"
862862
}
863863
},
864864
"source": [
@@ -886,8 +886,8 @@
886886
"cell_type": "code",
887887
"metadata": {
888888
"ExecuteTime": {
889-
"end_time": "2025-01-13T08:03:53.395952Z",
890-
"start_time": "2025-01-13T08:03:53.393425Z"
889+
"end_time": "2025-01-14T10:16:55.992301Z",
890+
"start_time": "2025-01-14T10:16:55.988357Z"
891891
}
892892
},
893893
"source": [
@@ -903,8 +903,8 @@
903903
"cell_type": "code",
904904
"metadata": {
905905
"ExecuteTime": {
906-
"end_time": "2025-01-13T08:04:44.787666Z",
907-
"start_time": "2025-01-13T08:03:54.545139Z"
906+
"end_time": "2025-01-14T10:18:04.910544Z",
907+
"start_time": "2025-01-14T10:16:58.651103Z"
908908
}
909909
},
910910
"source": [
@@ -916,7 +916,7 @@
916916
" mode=\"page\",\n",
917917
" extract_images=True,\n",
918918
" images_parser=LLMImageBlobParser(\n",
919-
" model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024), format=\"markdown\"\n",
919+
" model=ChatOpenAI(model=\"gpt-4o\", max_tokens=1024), format=\"markdown-link\"\n",
920920
" ),\n",
921921
")\n",
922922
"docs = loader.load()\n",
@@ -959,26 +959,18 @@
959959
"\n",
960960
"\n",
961961
"\n",
962-
"![Summary: Diagram illustrating the components of a layout system using coordinates, text blocks, and layout elements. It shows coordinate representations (intervals, rectangles, quadrilaterals) and text block features. The layout is a list of these elements. Includes transformation and operation APIs.\n",
962+
"![Diagram depicting the structure of layout elements in visualization processing. Upper section illustrates coordinate systems: x-interval, y-interval, rectangle, quadrilateral. Middle section introduces textblock: combining coordinates with extra features (Block Text, Block Type, Reading Order). Bottom section defines layout as a list of layout elements [coordinate1, textblock1, ..., textblock2, layout1\\\\]. Right side notes: \"The same transformation and operation APIs.\" Text: \n",
963963
"\n",
964-
"Extracted Text:\n",
965-
"- Coordinate\n",
966-
"- Coordinate\n",
967-
"- x-interval\n",
968-
"- y-interval\n",
969-
"- start\n",
970-
"- end\n",
971-
"- Rectangle\n",
972-
"- Quadrilateral\n",
973-
"- textblock\n",
974-
"- Coordinate\n",
975-
"- Extra features\n",
976-
"- Block Text\n",
977-
"- Block Type\n",
978-
"- Reading Order\n",
979-
"- layout\n",
980-
"- A list of the layout elements\n",
981-
"- The same transformation and operation APIs](#)\n"
964+
"Coordinate\n",
965+
"textblock\n",
966+
"Coordinate + Extra features\n",
967+
"Block Text\n",
968+
"Block Type\n",
969+
"Reading Order\n",
970+
"...\n",
971+
"layout\n",
972+
"A list of the layout elements\n",
973+
"The same transformation and operation APIs](#)\n"
982974
]
983975
}
984976
],
@@ -1002,8 +994,8 @@
1002994
"cell_type": "code",
1003995
"metadata": {
1004996
"ExecuteTime": {
1005-
"end_time": "2025-01-13T08:04:52.328048Z",
1006-
"start_time": "2025-01-13T08:04:51.152693Z"
997+
"end_time": "2025-01-14T10:18:16.140968Z",
998+
"start_time": "2025-01-14T10:18:14.888610Z"
1007999
}
10081000
},
10091001
"source": [
@@ -1112,8 +1104,8 @@
11121104
"cell_type": "code",
11131105
"metadata": {
11141106
"ExecuteTime": {
1115-
"end_time": "2025-01-13T08:04:55.110532Z",
1116-
"start_time": "2025-01-13T08:04:55.059126Z"
1107+
"end_time": "2025-01-14T10:18:17.672927Z",
1108+
"start_time": "2025-01-14T10:18:17.618741Z"
11171109
}
11181110
},
11191111
"source": [

0 commit comments

Comments
 (0)