Skip to content

Commit d173bd3

Browse files
committed
document_loaders: confluence: Add factory func for attachment parser
Adds a factory function that can be provided to the Confluence loader constructor. The factory function is called with the attachment information and should return a blob parser (`BaseBlobParser`) that is used to extract the content of the document.
1 parent 7b19269 commit d173bd3

File tree

1 file changed

+143
-195
lines changed

1 file changed

+143
-195
lines changed

libs/community/langchain_community/document_loaders/confluence.py

Lines changed: 143 additions & 195 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,117 @@
1212
wait_exponential,
1313
)
1414

15-
from langchain_community.document_loaders.base import BaseLoader
15+
from langchain_community.document_loaders.base import BaseBlobParser, BaseLoader
16+
from langchain_community.document_loaders.blob_loaders import Blob
1617

1718
logger = logging.getLogger(__name__)
1819

1920

21+
class SVGParser(BaseBlobParser):
    """Parser for SVG blobs.

    The SVG is rendered to an in-memory PNG and the text is then extracted
    from that image with ``pytesseract``.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse SVG content from a blob.

        Args:
            blob: Blob containing SVG data to be parsed

        Yields:
            Document: Document with extracted text content

        Raises:
            ImportError: If `pytesseract`, `Pillow`, `reportlab` or `svglib`
                is not installed.
            ValueError: If the blob content cannot be loaded as an SVG drawing.
        """
        try:
            import pytesseract
            from PIL import Image
            from reportlab.graphics import renderPM
            from svglib.svglib import svg2rlg
        except ImportError:
            raise ImportError(
                "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
                "please run `pip install pytesseract Pillow reportlab svglib`"
            )

        drawing = svg2rlg(BytesIO(blob.as_bytes()))
        # svg2rlg returns None when it cannot load the document; fail with a
        # clear message instead of an obscure error inside the renderer.
        if drawing is None:
            raise ValueError(f"Could not parse SVG content from {blob.source!r}")

        img_data = BytesIO()
        renderPM.drawToFile(drawing, img_data, fmt="PNG")
        img_data.seek(0)  # rewind so PIL reads the PNG from its first byte
        # Close the PIL image as soon as OCR is done.
        with Image.open(img_data) as image:
            text = pytesseract.image_to_string(image)
        yield Document(page_content=text, metadata={"source": blob.source})
50+
51+
52+
class XLSParser(BaseBlobParser):
    """Parser for XLS blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Dump every sheet of an XLS workbook into a single text document.

        Each sheet is written as ``<sheet name>:`` followed by one line per
        row, with every cell value followed by a tab, and a blank line after
        the sheet.

        Args:
            blob: Blob containing the XLS workbook bytes.

        Yields:
            Document: One document holding the text of the whole workbook.

        Raises:
            ImportError: If `xlrd` is not installed.
        """
        try:
            import xlrd
        except ImportError:
            raise ImportError("`xlrd` package not found, please run `pip install xlrd`")

        raw = blob.as_bytes()
        book = xlrd.open_workbook(file_contents=raw)

        pieces = []
        for sheet in book.sheets():
            pieces.append(f"{sheet.name}:\n")
            for row_idx in range(sheet.nrows):
                pieces.extend(
                    f"{sheet.cell_value(row_idx, col_idx)}\t"
                    for col_idx in range(sheet.ncols)
                )
                pieces.append("\n")
            pieces.append("\n")

        yield Document(
            page_content="".join(pieces), metadata={"source": blob.source}
        )
74+
75+
76+
class Doc2TXTParser(BaseBlobParser):
    """Parser for DOCX blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the text of a DOCX blob with ``docx2txt``.

        Args:
            blob: Blob containing the DOCX bytes.

        Yields:
            Document: One document whose content is the extracted text.

        Raises:
            ImportError: If `docx2txt` is not installed.
        """
        try:
            import docx2txt
        except ImportError:
            raise ImportError(
                "`docx2txt` package not found, please run `pip install docx2txt`"
            )

        stream = BytesIO(blob.as_bytes())
        extracted = docx2txt.process(stream)
        yield Document(page_content=extracted, metadata={"source": blob.source})
90+
91+
92+
def default_parser_factory(attachment_info: dict) -> Optional[BaseBlobParser]:
    """Default parser factory for ConfluenceLoader.

    Picks a blob parser from the attachment's media type, read from
    ``attachment_info["metadata"]["mediaType"]``:

    * PDF -> ``PyMuPDFParser``
    * DOCX -> ``Doc2TXTParser``
    * PNG / JPG / JPEG -> ``TesseractBlobParser``
    * XLS -> ``XLSParser``
    * SVG -> ``SVGParser``

    Args:
        attachment_info: Attachment information from Confluence.

    Returns:
        A parser for the attachment, or ``None`` when the media type is not
        one of the types listed above.
    """
    media_type = attachment_info["metadata"]["mediaType"]

    if media_type == "application/pdf":
        # Imported here so the PDF parser module is only loaded when needed.
        from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

        return PyMuPDFParser()

    if media_type == (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    ):
        return Doc2TXTParser()

    if media_type in ("image/png", "image/jpg", "image/jpeg"):
        from langchain_community.document_loaders.parsers.images import (
            TesseractBlobParser,
        )

        return TesseractBlobParser()

    if media_type == "application/vnd.ms-excel":
        return XLSParser()

    if media_type == "image/svg+xml":
        return SVGParser()

    return None
124+
125+
20126
class ContentFormat(str, Enum):
21127
"""Enumerator of the content formats of Confluence page."""
22128

@@ -123,6 +229,12 @@ class ConfluenceLoader(BaseLoader):
123229
:param attachment_filter_func: A function that takes the attachment information
124230
from Confluence and decides whether or not the
125231
attachment is processed.
232+
:type attachment_filter_func: Callable[[dict], bool], optional
233+
:param attachment_parser_factory: A function that takes the attachment information
234+
from Confluence and returns a parser for the
235+
attachment.
236+
:type attachment_parser_factory:
237+
Callable[[dict], Optional[BaseBlobParser]], optional
126238
:param include_comments: defaults to False
127239
:type include_comments: bool, optional
128240
:param content_format: Specify content format, defaults to
@@ -180,6 +292,9 @@ def __init__(
180292
keep_markdown_format: bool = False,
181293
keep_newlines: bool = False,
182294
attachment_filter_func: Optional[Callable[[dict], bool]] = None,
295+
attachment_parser_factory: Optional[
296+
Callable[[dict], Optional[BaseBlobParser]]
297+
] = default_parser_factory,
183298
):
184299
self.space_key = space_key
185300
self.page_ids = page_ids
@@ -197,6 +312,7 @@ def __init__(
197312
self.keep_markdown_format = keep_markdown_format
198313
self.keep_newlines = keep_newlines
199314
self.attachment_filter_func = attachment_filter_func
315+
self.attachment_parser_factory = attachment_parser_factory
200316

201317
confluence_kwargs = confluence_kwargs or {}
202318
errors = ConfluenceLoader.validate_init_args(
@@ -675,26 +791,32 @@ def process_attachment(
675791
absolute_url = self.base_url + attachment["_links"]["download"]
676792
title = attachment["title"]
677793
try:
678-
if media_type == "application/pdf":
679-
text = title + self.process_pdf(absolute_url, ocr_languages)
680-
elif (
681-
media_type == "image/png"
682-
or media_type == "image/jpg"
683-
or media_type == "image/jpeg"
684-
):
685-
text = title + self.process_image(absolute_url, ocr_languages)
686-
elif (
687-
media_type == "application/vnd.openxmlformats-officedocument"
688-
".wordprocessingml.document"
689-
):
690-
text = title + self.process_doc(absolute_url)
691-
elif media_type == "application/vnd.ms-excel":
692-
text = title + self.process_xls(absolute_url)
693-
elif media_type == "image/svg+xml":
694-
text = title + self.process_svg(absolute_url, ocr_languages)
695-
else:
696-
continue
697-
texts.append(text)
794+
if self.attachment_parser_factory:
795+
parser = self.attachment_parser_factory(attachment)
796+
if parser is None:
797+
continue
798+
799+
response = self.confluence.request(path=absolute_url, absolute=True)
800+
801+
if (
802+
response.status_code != 200
803+
or response.content == b""
804+
or response.content is None
805+
):
806+
continue
807+
808+
blob = Blob(
809+
data=response.content,
810+
mimetype=media_type,
811+
)
812+
text = (
813+
title
814+
+ " "
815+
+ "\n\n".join(
816+
[doc.page_content for doc in parser.lazy_parse(blob)]
817+
)
818+
)
819+
texts.append(text)
698820
except requests.HTTPError as e:
699821
if e.response.status_code == 404:
700822
print(f"Attachment not found at {absolute_url}") # noqa: T201
@@ -703,177 +825,3 @@ def process_attachment(
703825
raise
704826

705827
return texts
706-
707-
def process_pdf(
    self,
    link: str,
    ocr_languages: Optional[str] = None,
) -> str:
    """Download a PDF attachment and return its OCR text, page by page.

    Args:
        link: Absolute URL of the attachment, requested through
            ``self.confluence``.
        ocr_languages: Passed to ``pytesseract.image_to_string`` as ``lang``.

    Returns:
        The concatenated ``Page N:`` sections, or an empty string when the
        response is not a 200, has no content, or cannot be converted to
        images.

    Raises:
        ImportError: If `pytesseract` or `pdf2image` is not installed.
    """
    try:
        import pytesseract
        from pdf2image import convert_from_bytes
    except ImportError:
        raise ImportError(
            "`pytesseract` or `pdf2image` package not found, "
            "please run `pip install pytesseract pdf2image`"
        )

    response = self.confluence.request(path=link, absolute=True)

    if not (
        response.status_code == 200
        and response.content != b""
        and response.content is not None
    ):
        return ""

    try:
        pages = convert_from_bytes(response.content)
    except ValueError:
        return ""

    sections = []
    for page_number, page in enumerate(pages, start=1):
        try:
            page_text = pytesseract.image_to_string(page, lang=ocr_languages)
        except pytesseract.TesseractError as ex:
            # A page that fails OCR is logged and left out of the result.
            logger.warning(f"TesseractError: {ex}")
        else:
            sections.append(f"Page {page_number}:\n{page_text}\n\n")

    return "".join(sections)
743-
744-
def process_image(
    self,
    link: str,
    ocr_languages: Optional[str] = None,
) -> str:
    """Download an image attachment and return the text found by OCR.

    Args:
        link: Absolute URL of the attachment, requested through
            ``self.confluence``.
        ocr_languages: Passed to ``pytesseract.image_to_string`` as ``lang``.

    Returns:
        The OCR text, or an empty string when the response is not a 200,
        has no content, or PIL cannot open it as an image.

    Raises:
        ImportError: If `pytesseract` or `Pillow` is not installed.
    """
    try:
        import pytesseract
        from PIL import Image
    except ImportError:
        raise ImportError(
            "`pytesseract` or `Pillow` package not found, "
            "please run `pip install pytesseract Pillow`"
        )

    response = self.confluence.request(path=link, absolute=True)

    if not (
        response.status_code == 200
        and response.content != b""
        and response.content is not None
    ):
        return ""

    try:
        picture = Image.open(BytesIO(response.content))
    except OSError:
        return ""

    return pytesseract.image_to_string(picture, lang=ocr_languages)
773-
774-
def process_doc(self, link: str) -> str:
    """Download a DOCX attachment and return its text via ``docx2txt``.

    Args:
        link: Absolute URL of the attachment, requested through
            ``self.confluence``.

    Returns:
        The extracted text, or an empty string when the response is not a
        200 or has no content.

    Raises:
        ImportError: If `docx2txt` is not installed.
    """
    try:
        import docx2txt
    except ImportError:
        raise ImportError(
            "`docx2txt` package not found, please run `pip install docx2txt`"
        )

    response = self.confluence.request(path=link, absolute=True)

    if not (
        response.status_code == 200
        and response.content != b""
        and response.content is not None
    ):
        return ""

    return docx2txt.process(BytesIO(response.content))
794-
795-
def process_xls(self, link: str) -> str:
    """Download a spreadsheet attachment and return its content as text.

    A link whose file extension starts with ``.csv`` is read with pandas;
    anything else is opened as a workbook with ``xlrd`` and dumped sheet by
    sheet, one line per row with a tab after each cell value.

    Args:
        link: Absolute URL of the attachment, requested through
            ``self.confluence``.

    Returns:
        The text of the spreadsheet, or an empty string when the response
        is not a 200 or has no content.

    Raises:
        ImportError: If `xlrd` or `pandas` is not installed.
    """
    import io
    import os

    try:
        import xlrd

    except ImportError:
        raise ImportError("`xlrd` package not found, please run `pip install xlrd`")

    try:
        import pandas as pd

    except ImportError:
        raise ImportError(
            "`pandas` package not found, please run `pip install pandas`"
        )

    response = self.confluence.request(path=link, absolute=True)

    if not (
        response.status_code == 200
        and response.content != b""
        and response.content is not None
    ):
        return ""

    # The "extension" is everything after the last dot of the URL's basename,
    # so it may carry the query string as well, for example:
    # ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
    extension = os.path.splitext(os.path.basename(link))[1]

    if extension.startswith(".csv"):
        decoded = response.content.decode("utf-8")
        frame = pd.read_csv(io.StringIO(decoded))
        return frame.to_string(index=False, header=False) + "\n\n"

    book = xlrd.open_workbook(file_contents=response.content)
    pieces = []
    for sheet in book.sheets():
        pieces.append(f"{sheet.name}:\n")
        for row_idx in range(sheet.nrows):
            pieces.extend(
                f"{sheet.cell_value(row_idx, col_idx)}\t"
                for col_idx in range(sheet.ncols)
            )
            pieces.append("\n")
        pieces.append("\n")

    return "".join(pieces)
845-
846-
def process_svg(
    self,
    link: str,
    ocr_languages: Optional[str] = None,
) -> str:
    """Download an SVG attachment, render it to PNG and return the OCR text.

    Args:
        link: Absolute URL of the attachment, requested through
            ``self.confluence``.
        ocr_languages: Passed to ``pytesseract.image_to_string`` as ``lang``.

    Returns:
        The OCR text, or an empty string when the response is not a 200 or
        has no content.

    Raises:
        ImportError: If `pytesseract`, `Pillow`, `reportlab` or `svglib` is
            not installed.
    """
    try:
        import pytesseract
        from PIL import Image
        from reportlab.graphics import renderPM
        from svglib.svglib import svg2rlg
    except ImportError:
        raise ImportError(
            "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
            "please run `pip install pytesseract Pillow reportlab svglib`"
        )

    response = self.confluence.request(path=link, absolute=True)

    if not (
        response.status_code == 200
        and response.content != b""
        and response.content is not None
    ):
        return ""

    vector_drawing = svg2rlg(BytesIO(response.content))

    png_buffer = BytesIO()
    renderPM.drawToFile(vector_drawing, png_buffer, fmt="PNG")
    png_buffer.seek(0)  # rewind so PIL reads the PNG from its first byte
    rendered = Image.open(png_buffer)

    return pytesseract.image_to_string(rendered, lang=ocr_languages)

0 commit comments

Comments
 (0)