Skip to content

Commit b434d66

Browse files
committed
Refactor pypdf
1 parent 1117d72 commit b434d66

File tree

6 files changed

+96
-98
lines changed

6 files changed

+96
-98
lines changed

docs/docs/integrations/document_loaders/pypdfdirectory.ipynb

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -38,8 +38,8 @@
3838
"cell_type": "code",
3939
"metadata": {
4040
"ExecuteTime": {
41-
"end_time": "2025-01-16T09:36:05.879020Z",
42-
"start_time": "2025-01-16T09:36:05.873373Z"
41+
"end_time": "2025-01-21T08:00:08.878423Z",
42+
"start_time": "2025-01-21T08:00:08.876042Z"
4343
}
4444
},
4545
"source": [
@@ -61,8 +61,8 @@
6161
{
6262
"metadata": {
6363
"ExecuteTime": {
64-
"end_time": "2025-01-16T09:36:09.110520Z",
65-
"start_time": "2025-01-16T09:36:07.189960Z"
64+
"end_time": "2025-01-21T08:00:12.003718Z",
65+
"start_time": "2025-01-21T08:00:10.291617Z"
6666
}
6767
},
6868
"cell_type": "code",
@@ -90,8 +90,8 @@
9090
{
9191
"metadata": {
9292
"ExecuteTime": {
93-
"end_time": "2025-01-16T09:36:11.460830Z",
94-
"start_time": "2025-01-16T09:36:10.759782Z"
93+
"end_time": "2025-01-21T08:00:18.512061Z",
94+
"start_time": "2025-01-21T08:00:17.313969Z"
9595
}
9696
},
9797
"cell_type": "code",
@@ -117,8 +117,8 @@
117117
"cell_type": "code",
118118
"metadata": {
119119
"ExecuteTime": {
120-
"end_time": "2025-01-16T09:36:13.032254Z",
121-
"start_time": "2025-01-16T09:36:12.666040Z"
120+
"end_time": "2025-01-21T08:00:23.549752Z",
121+
"start_time": "2025-01-21T08:00:23.129010Z"
122122
}
123123
},
124124
"source": [
@@ -143,8 +143,8 @@
143143
"cell_type": "code",
144144
"metadata": {
145145
"ExecuteTime": {
146-
"end_time": "2025-01-16T09:36:14.035117Z",
147-
"start_time": "2025-01-16T09:36:14.031817Z"
146+
"end_time": "2025-01-21T08:00:26.612346Z",
147+
"start_time": "2025-01-21T08:00:26.609051Z"
148148
}
149149
},
150150
"source": [
@@ -172,8 +172,8 @@
172172
"cell_type": "code",
173173
"metadata": {
174174
"ExecuteTime": {
175-
"end_time": "2025-01-16T09:36:15.803314Z",
176-
"start_time": "2025-01-16T09:36:15.492399Z"
175+
"end_time": "2025-01-21T08:00:30.251598Z",
176+
"start_time": "2025-01-21T08:00:29.972141Z"
177177
}
178178
},
179179
"source": [

docs/docs/integrations/document_loaders/pypdfloader.ipynb

Lines changed: 69 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -43,8 +43,8 @@
4343
"cell_type": "code",
4444
"metadata": {
4545
"ExecuteTime": {
46-
"end_time": "2025-01-17T12:10:57.550785Z",
47-
"start_time": "2025-01-17T12:10:57.547909Z"
46+
"end_time": "2025-01-21T08:01:08.825630Z",
47+
"start_time": "2025-01-21T08:01:08.823315Z"
4848
}
4949
},
5050
"source": [
@@ -67,8 +67,8 @@
6767
"cell_type": "code",
6868
"metadata": {
6969
"ExecuteTime": {
70-
"end_time": "2025-01-17T12:11:00.890490Z",
71-
"start_time": "2025-01-17T12:10:58.790382Z"
70+
"end_time": "2025-01-21T08:01:10.997240Z",
71+
"start_time": "2025-01-21T08:01:09.529997Z"
7272
}
7373
},
7474
"source": "%pip install -qU langchain_community pypdf",
@@ -96,8 +96,8 @@
9696
"cell_type": "code",
9797
"metadata": {
9898
"ExecuteTime": {
99-
"end_time": "2025-01-17T12:11:03.023257Z",
100-
"start_time": "2025-01-17T12:11:01.886241Z"
99+
"end_time": "2025-01-21T08:01:14.685958Z",
100+
"start_time": "2025-01-21T08:01:13.653438Z"
101101
}
102102
},
103103
"source": [
@@ -120,8 +120,8 @@
120120
"cell_type": "code",
121121
"metadata": {
122122
"ExecuteTime": {
123-
"end_time": "2025-01-17T12:11:04.645734Z",
124-
"start_time": "2025-01-17T12:11:04.252405Z"
123+
"end_time": "2025-01-21T08:01:17.433959Z",
124+
"start_time": "2025-01-21T08:01:17.080724Z"
125125
}
126126
},
127127
"source": [
@@ -146,8 +146,8 @@
146146
"cell_type": "code",
147147
"metadata": {
148148
"ExecuteTime": {
149-
"end_time": "2025-01-17T12:11:04.968142Z",
150-
"start_time": "2025-01-17T12:11:04.961721Z"
149+
"end_time": "2025-01-21T08:01:18.619845Z",
150+
"start_time": "2025-01-21T08:01:18.615643Z"
151151
}
152152
},
153153
"source": [
@@ -191,8 +191,8 @@
191191
"cell_type": "code",
192192
"metadata": {
193193
"ExecuteTime": {
194-
"end_time": "2025-01-17T12:11:06.736178Z",
195-
"start_time": "2025-01-17T12:11:06.436295Z"
194+
"end_time": "2025-01-21T08:01:20.450806Z",
195+
"start_time": "2025-01-21T08:01:20.176333Z"
196196
}
197197
},
198198
"source": [
@@ -224,8 +224,8 @@
224224
"cell_type": "code",
225225
"metadata": {
226226
"ExecuteTime": {
227-
"end_time": "2025-01-17T12:11:07.106945Z",
228-
"start_time": "2025-01-17T12:11:07.104570Z"
227+
"end_time": "2025-01-21T08:01:21.267444Z",
228+
"start_time": "2025-01-21T08:01:21.263726Z"
229229
}
230230
},
231231
"source": [
@@ -300,8 +300,8 @@
300300
"cell_type": "code",
301301
"metadata": {
302302
"ExecuteTime": {
303-
"end_time": "2025-01-17T12:11:10.080152Z",
304-
"start_time": "2025-01-17T12:11:09.712079Z"
303+
"end_time": "2025-01-21T08:01:28.128153Z",
304+
"start_time": "2025-01-21T08:01:27.823798Z"
305305
}
306306
},
307307
"source": [
@@ -353,8 +353,8 @@
353353
"cell_type": "code",
354354
"metadata": {
355355
"ExecuteTime": {
356-
"end_time": "2025-01-17T12:11:12.235712Z",
357-
"start_time": "2025-01-17T12:11:11.863115Z"
356+
"end_time": "2025-01-21T08:01:31.794895Z",
357+
"start_time": "2025-01-21T08:01:31.470806Z"
358358
}
359359
},
360360
"source": [
@@ -404,8 +404,8 @@
404404
"cell_type": "code",
405405
"metadata": {
406406
"ExecuteTime": {
407-
"end_time": "2025-01-17T12:11:14.448661Z",
408-
"start_time": "2025-01-17T12:11:14.168617Z"
407+
"end_time": "2025-01-21T08:01:35.848808Z",
408+
"start_time": "2025-01-21T08:01:35.575903Z"
409409
}
410410
},
411411
"source": [
@@ -551,8 +551,8 @@
551551
"cell_type": "code",
552552
"metadata": {
553553
"ExecuteTime": {
554-
"end_time": "2025-01-17T12:11:18.808536Z",
555-
"start_time": "2025-01-17T12:11:17.280579Z"
554+
"end_time": "2025-01-21T08:01:40.692855Z",
555+
"start_time": "2025-01-21T08:01:39.293791Z"
556556
}
557557
},
558558
"source": [
@@ -572,8 +572,8 @@
572572
{
573573
"metadata": {
574574
"ExecuteTime": {
575-
"end_time": "2025-01-17T12:11:48.290690Z",
576-
"start_time": "2025-01-17T12:11:18.875353Z"
575+
"end_time": "2025-01-21T08:02:07.273962Z",
576+
"start_time": "2025-01-21T08:01:42.848244Z"
577577
}
578578
},
579579
"cell_type": "code",
@@ -679,8 +679,8 @@
679679
"cell_type": "code",
680680
"metadata": {
681681
"ExecuteTime": {
682-
"end_time": "2025-01-17T12:11:52.817982Z",
683-
"start_time": "2025-01-17T12:11:51.301231Z"
682+
"end_time": "2025-01-21T08:02:12.070378Z",
683+
"start_time": "2025-01-21T08:02:10.696635Z"
684684
}
685685
},
686686
"source": [
@@ -700,8 +700,8 @@
700700
{
701701
"metadata": {
702702
"ExecuteTime": {
703-
"end_time": "2025-01-17T12:12:04.817442Z",
704-
"start_time": "2025-01-17T12:11:52.884001Z"
703+
"end_time": "2025-01-21T08:02:21.712219Z",
704+
"start_time": "2025-01-21T08:02:12.081700Z"
705705
}
706706
},
707707
"cell_type": "code",
@@ -801,8 +801,8 @@
801801
"cell_type": "code",
802802
"metadata": {
803803
"ExecuteTime": {
804-
"end_time": "2025-01-17T12:12:07.867340Z",
805-
"start_time": "2025-01-17T12:12:06.284156Z"
804+
"end_time": "2025-01-21T08:02:25.912928Z",
805+
"start_time": "2025-01-21T08:02:24.324014Z"
806806
}
807807
},
808808
"source": [
@@ -823,8 +823,8 @@
823823
"cell_type": "code",
824824
"metadata": {
825825
"ExecuteTime": {
826-
"end_time": "2025-01-17T12:12:08.590038Z",
827-
"start_time": "2025-01-17T12:12:08.558666Z"
826+
"end_time": "2025-01-21T08:02:28.494996Z",
827+
"start_time": "2025-01-21T08:02:28.468181Z"
828828
}
829829
},
830830
"source": [
@@ -852,8 +852,8 @@
852852
"cell_type": "code",
853853
"metadata": {
854854
"ExecuteTime": {
855-
"end_time": "2025-01-17T12:12:10.481853Z",
856-
"start_time": "2025-01-17T12:12:10.479511Z"
855+
"end_time": "2025-01-21T08:02:29.318093Z",
856+
"start_time": "2025-01-21T08:02:29.314654Z"
857857
}
858858
},
859859
"source": [
@@ -868,8 +868,8 @@
868868
{
869869
"metadata": {
870870
"ExecuteTime": {
871-
"end_time": "2025-01-17T12:13:19.467357Z",
872-
"start_time": "2025-01-17T12:12:11.684641Z"
871+
"end_time": "2025-01-21T08:05:00.352337Z",
872+
"start_time": "2025-01-21T08:02:30.723099Z"
873873
}
874874
},
875875
"cell_type": "code",
@@ -921,43 +921,38 @@
921921
"\n",
922922
"\n",
923923
"\n",
924-
"![**Image Summary for Retrieval**: Diagram showing the components of document layout analysis using coordinates and text blocks. Includes elements like intervals, rectangles, quadrilaterals, and extra features like block text and order. It illustrates combining coordinates and text features to form a layout structure.\n",
924+
"![**Image Summary:** \n",
925+
"Diagram explaining coordinate systems and layout elements with labels for coordinate intervals, rectangle, quadrilateral, textblock with extra features, and layout list. Includes transformation and operation APIs.\n",
925926
"\n",
926-
"**Extracted Text**:\n",
927-
"\n",
928-
"```\n",
929-
"Coordinate\n",
930-
"Coordinate\n",
931-
"x-interval\n",
932-
"start\n",
933-
"start\n",
934-
"end\n",
935-
"end\n",
936-
"Rectangle\n",
937-
"(x1, y1)\n",
938-
"Coordinate \n",
939-
"+x-interval\n",
940-
"Extra features\n",
941-
"(x2, y2)\n",
942-
"Quadrilateral\n",
943-
"(x1, y1)\n",
944-
"Block\n",
945-
"Block\n",
946-
"Reading\n",
947-
"Text\n",
948-
"Type\n",
949-
"Order\n",
950-
"\n",
951-
"layout\n",
952-
"textblock\n",
953-
"Coordinate \n",
954-
"Extra features\n",
955-
"List of the layout elements\n",
956-
"(x2, y2)\n",
957-
"(x4, y4)\n",
958-
"(x3, y3)\n",
959-
"The same transformation and operation APIs\n",
960-
"```](#)\n"
927+
"**Extracted Text:** \n",
928+
"Coordinate \n",
929+
"coordinate \n",
930+
"start \n",
931+
"start \n",
932+
"x-interval \n",
933+
"end \n",
934+
"y-interval \n",
935+
"end \n",
936+
"(x1, y1) \n",
937+
"Rectangle \n",
938+
"(x2, y2) \n",
939+
"(x1, y1) \n",
940+
"Quadrilateral \n",
941+
"(x2, y2) \n",
942+
"(x4, y4) \n",
943+
"(x3, y3) \n",
944+
"The same transformation and operation APIs \n",
945+
"textblock \n",
946+
"Coordinate \n",
947+
"Extra features \n",
948+
"Block Text \n",
949+
"Block Type \n",
950+
"Reading Order \n",
951+
"... \n",
952+
"layout \n",
953+
"coordinate1, textblock1, ... \n",
954+
"..., textblock2, layout1 \n",
955+
"A list of the layout elements ](#)\n"
961956
]
962957
}
963958
],
@@ -979,8 +974,8 @@
979974
"cell_type": "code",
980975
"metadata": {
981976
"ExecuteTime": {
982-
"end_time": "2025-01-17T12:13:22.174201Z",
983-
"start_time": "2025-01-17T12:13:21.884811Z"
977+
"end_time": "2025-01-21T08:05:08.330141Z",
978+
"start_time": "2025-01-21T08:05:07.997956Z"
984979
}
985980
},
986981
"source": [

libs/community/langchain_community/document_loaders/parsers/images.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import io
33
import logging
44
from abc import abstractmethod
5+
from typing import TYPE_CHECKING, Iterable, Iterator
56

67
import numpy
78
import numpy as np

libs/community/langchain_community/document_loaders/parsers/pdf.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,8 @@
1717
Mapping,
1818
Optional,
1919
Sequence,
20-
Union, cast,
20+
Union,
21+
cast,
2122
)
2223
from urllib.parse import urlparse
2324

@@ -456,7 +457,7 @@ def extract_images_from_page(self, page: pypdf._page.PageObject) -> str:
456457
image_bytes = io.BytesIO()
457458
Image.fromarray(np_image).save(image_bytes, format="PNG")
458459
blob = Blob.from_data(image_bytes.getvalue(), mime_type="image/png")
459-
image_text=next(self.images_parser.lazy_parse(blob)).page_content
460+
image_text = next(self.images_parser.lazy_parse(blob)).page_content
460461
images.append(
461462
_format_inner_image(blob, image_text, self.images_inner_format)
462463
)

libs/community/langchain_community/document_loaders/pdf.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -615,7 +615,7 @@ def __init__(
615615
pages_delimiter: str = _DEFAULT_PAGES_DELIMITER,
616616
extract_images: bool = False,
617617
images_parser: Optional[BaseImageBlobParser] = None,
618-
images_inner_format: str = "text",
618+
images_inner_format: Literal["text", "markdown-img", "html-img"] = "text",
619619
extract_tables: Union[Literal["csv", "markdown", "html"], None] = None,
620620
headers: Optional[dict] = None,
621621
extract_tables_settings: Optional[dict[str, Any]] = None,

0 commit comments

Comments
 (0)