Skip to content

Commit f2abc3c

Browse files
committed
document_loaders: confluence: Add factory func for attachment parser
Adds a factory function that can be provided to the Confluence Loader constructor. The factory function is called with the attachment information and should return a BaseBlobParser that is used to extract the content of the document.
1 parent e807a4c commit f2abc3c

File tree

1 file changed

+145
-195
lines changed

1 file changed

+145
-195
lines changed

libs/community/langchain_community/document_loaders/confluence.py

Lines changed: 145 additions & 195 deletions
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,120 @@
1212
wait_exponential,
1313
)
1414

15-
from langchain_community.document_loaders.base import BaseLoader
15+
from langchain_community.document_loaders.base import BaseLoader, BaseBlobParser
16+
from langchain_community.document_loaders.blob_loaders import Blob
1617

1718
logger = logging.getLogger(__name__)
1819

1920

21+
class SVGParser(BaseBlobParser):
    """Parser for SVG blobs.

    The SVG is rasterized to PNG and the resulting image is run through
    Tesseract OCR to extract any text it contains.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse SVG content from a blob.

        Args:
            blob: Blob containing SVG data to be parsed

        Yields:
            Document: Document with extracted text content. Nothing is
                yielded when the SVG data cannot be parsed.

        Raises:
            ImportError: If `pytesseract`, `Pillow`, `reportlab` or `svglib`
                is not installed.
        """
        try:
            import pytesseract
            from PIL import Image
            from reportlab.graphics import renderPM
            from svglib.svglib import svg2rlg
        except ImportError as exc:
            raise ImportError(
                "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
                "please run `pip install pytesseract Pillow reportlab svglib`"
            ) from exc

        drawing = svg2rlg(BytesIO(blob.as_bytes()))
        if drawing is None:
            # svg2rlg returns None when it cannot load the SVG; passing that
            # on to renderPM would fail with an unrelated error, so skip the
            # blob instead.
            logger.warning("Could not parse SVG data from %s, skipping", blob.source)
            return

        # Rasterize into an in-memory PNG so no temporary file is needed.
        png_buffer = BytesIO()
        renderPM.drawToFile(drawing, png_buffer, fmt="PNG")
        png_buffer.seek(0)

        with Image.open(png_buffer) as image:
            text = pytesseract.image_to_string(image)
        yield Document(page_content=text, metadata={"source": blob.source})
50+
51+
52+
class XLSParser(BaseBlobParser):
    """Parser for XLS blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse the workbook stored in a blob into a single text document.

        Each sheet is rendered as its name followed by ``:``, then one line
        per row with every cell value followed by a tab, and a blank line
        after the sheet.

        Args:
            blob: Blob containing the XLS data to be parsed

        Yields:
            Document: Document with the text of all sheets

        Raises:
            ImportError: If `xlrd` is not installed.
        """
        try:
            import xlrd
        except ImportError as exc:
            raise ImportError(
                "`xlrd` package not found, please run `pip install xlrd`"
            ) from exc

        workbook = xlrd.open_workbook(file_contents=blob.as_bytes())

        # Collect the pieces and join once instead of repeated `+=`.
        parts = []
        for sheet in workbook.sheets():
            parts.append(f"{sheet.name}:\n")
            for row in range(sheet.nrows):
                parts.extend(
                    f"{sheet.cell_value(row, col)}\t" for col in range(sheet.ncols)
                )
                parts.append("\n")
            parts.append("\n")

        yield Document(page_content="".join(parts), metadata={"source": blob.source})
77+
78+
79+
class Doc2TXTParser(BaseBlobParser):
    """Blob parser that extracts the text of DOCX data with `docx2txt`."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the text of the DOCX data held by a blob.

        Args:
            blob: Blob containing the DOCX data to be parsed

        Yields:
            Document: Document whose content is the text returned by
                `docx2txt.process`, with the blob source as metadata
        """
        try:
            import docx2txt
        except ImportError:
            raise ImportError(
                "`docx2txt` package not found, please run `pip install docx2txt`"
            )

        docx_stream = BytesIO(blob.as_bytes())
        extracted_text = docx2txt.process(docx_stream)
        source_metadata = {"source": blob.source}
        yield Document(page_content=extracted_text, metadata=source_metadata)
93+
94+
95+
def default_parser_factory(attachment_info: dict) -> "Optional[BaseBlobParser]":
    """Default parser factory for ConfluenceLoader.

    This function takes the attachment information from Confluence and returns
    a parser for the attachment, chosen by the attachment's media type.

    Args:
        attachment_info: Attachment information; the media type is read from
            ``attachment_info["metadata"]["mediaType"]``.

    Returns:
        A parser for PDF, DOCX, PNG/JPEG, XLS or SVG attachments, or ``None``
        when the media type is missing or not handled.
    """
    # Tolerate attachments without metadata or media type: such attachments
    # simply have no parser rather than raising KeyError/TypeError.
    metadata = attachment_info.get("metadata") or {}
    mime_type = metadata.get("mediaType")

    if mime_type == "application/pdf":
        # Imported lazily so the PDF dependency is only needed when used.
        from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

        return PyMuPDFParser()
    elif (
        mime_type == "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    ):
        return Doc2TXTParser()
    elif mime_type in ("image/png", "image/jpg", "image/jpeg"):
        # Imported lazily so the OCR dependency is only needed when used.
        from langchain_community.document_loaders.parsers.images import (
            TesseractBlobParser,
        )

        return TesseractBlobParser()
    elif mime_type == "application/vnd.ms-excel":
        return XLSParser()
    elif mime_type == "image/svg+xml":
        return SVGParser()

    return None
127+
128+
20129
class ContentFormat(str, Enum):
21130
"""Enumerator of the content formats of Confluence page."""
22131

@@ -123,6 +232,11 @@ class ConfluenceLoader(BaseLoader):
123232
:param attachment_filter_func: A function that takes the attachment information
124233
from Confluence and decides whether or not the
125234
attachment is processed.
235+
:type attachment_filter_func: Callable[[dict], bool], optional
236+
:param attachment_parser_factory: A function that takes the attachment information
237+
from Confluence and returns a parser for the
238+
attachment.
239+
:type attachment_parser_factory: Callable[[dict], Optional[BaseBlobParser]], optional
126240
:param include_comments: defaults to False
127241
:type include_comments: bool, optional
128242
:param content_format: Specify content format, defaults to
@@ -180,6 +294,9 @@ def __init__(
180294
keep_markdown_format: bool = False,
181295
keep_newlines: bool = False,
182296
attachment_filter_func: Optional[Callable[[dict], bool]] = None,
297+
attachment_parser_factory: Optional[
298+
Callable[[dict], Optional[BaseBlobParser]]
299+
] = default_parser_factory,
183300
):
184301
self.space_key = space_key
185302
self.page_ids = page_ids
@@ -197,6 +314,7 @@ def __init__(
197314
self.keep_markdown_format = keep_markdown_format
198315
self.keep_newlines = keep_newlines
199316
self.attachment_filter_func = attachment_filter_func
317+
self.attachment_parser_factory = attachment_parser_factory
200318

201319
confluence_kwargs = confluence_kwargs or {}
202320
errors = ConfluenceLoader.validate_init_args(
@@ -675,26 +793,32 @@ def process_attachment(
675793
absolute_url = self.base_url + attachment["_links"]["download"]
676794
title = attachment["title"]
677795
try:
678-
if media_type == "application/pdf":
679-
text = title + self.process_pdf(absolute_url, ocr_languages)
680-
elif (
681-
media_type == "image/png"
682-
or media_type == "image/jpg"
683-
or media_type == "image/jpeg"
684-
):
685-
text = title + self.process_image(absolute_url, ocr_languages)
686-
elif (
687-
media_type == "application/vnd.openxmlformats-officedocument"
688-
".wordprocessingml.document"
689-
):
690-
text = title + self.process_doc(absolute_url)
691-
elif media_type == "application/vnd.ms-excel":
692-
text = title + self.process_xls(absolute_url)
693-
elif media_type == "image/svg+xml":
694-
text = title + self.process_svg(absolute_url, ocr_languages)
695-
else:
696-
continue
697-
texts.append(text)
796+
if self.attachment_parser_factory:
797+
parser = self.attachment_parser_factory(attachment)
798+
if parser is None:
799+
continue
800+
801+
response = self.confluence.request(path=absolute_url, absolute=True)
802+
803+
if (
804+
response.status_code != 200
805+
or response.content == b""
806+
or response.content is None
807+
):
808+
continue
809+
810+
blob = Blob(
811+
data=response.content,
812+
mimetype=media_type,
813+
)
814+
text = (
815+
title
816+
+ " "
817+
+ "\n\n".join(
818+
[doc.page_content for doc in parser.lazy_parse(blob)]
819+
)
820+
)
821+
texts.append(text)
698822
except requests.HTTPError as e:
699823
if e.response.status_code == 404:
700824
print(f"Attachment not found at {absolute_url}") # noqa: T201
@@ -703,177 +827,3 @@ def process_attachment(
703827
raise
704828

705829
return texts
706-
707-
def process_pdf(
708-
self,
709-
link: str,
710-
ocr_languages: Optional[str] = None,
711-
) -> str:
712-
try:
713-
import pytesseract
714-
from pdf2image import convert_from_bytes
715-
except ImportError:
716-
raise ImportError(
717-
"`pytesseract` or `pdf2image` package not found, "
718-
"please run `pip install pytesseract pdf2image`"
719-
)
720-
721-
response = self.confluence.request(path=link, absolute=True)
722-
text = ""
723-
724-
if (
725-
response.status_code != 200
726-
or response.content == b""
727-
or response.content is None
728-
):
729-
return text
730-
try:
731-
images = convert_from_bytes(response.content)
732-
except ValueError:
733-
return text
734-
735-
for i, image in enumerate(images):
736-
try:
737-
image_text = pytesseract.image_to_string(image, lang=ocr_languages)
738-
text += f"Page {i + 1}:\n{image_text}\n\n"
739-
except pytesseract.TesseractError as ex:
740-
logger.warning(f"TesseractError: {ex}")
741-
742-
return text
743-
744-
def process_image(
745-
self,
746-
link: str,
747-
ocr_languages: Optional[str] = None,
748-
) -> str:
749-
try:
750-
import pytesseract
751-
from PIL import Image
752-
except ImportError:
753-
raise ImportError(
754-
"`pytesseract` or `Pillow` package not found, "
755-
"please run `pip install pytesseract Pillow`"
756-
)
757-
758-
response = self.confluence.request(path=link, absolute=True)
759-
text = ""
760-
761-
if (
762-
response.status_code != 200
763-
or response.content == b""
764-
or response.content is None
765-
):
766-
return text
767-
try:
768-
image = Image.open(BytesIO(response.content))
769-
except OSError:
770-
return text
771-
772-
return pytesseract.image_to_string(image, lang=ocr_languages)
773-
774-
def process_doc(self, link: str) -> str:
775-
try:
776-
import docx2txt
777-
except ImportError:
778-
raise ImportError(
779-
"`docx2txt` package not found, please run `pip install docx2txt`"
780-
)
781-
782-
response = self.confluence.request(path=link, absolute=True)
783-
text = ""
784-
785-
if (
786-
response.status_code != 200
787-
or response.content == b""
788-
or response.content is None
789-
):
790-
return text
791-
file_data = BytesIO(response.content)
792-
793-
return docx2txt.process(file_data)
794-
795-
def process_xls(self, link: str) -> str:
796-
import io
797-
import os
798-
799-
try:
800-
import xlrd
801-
802-
except ImportError:
803-
raise ImportError("`xlrd` package not found, please run `pip install xlrd`")
804-
805-
try:
806-
import pandas as pd
807-
808-
except ImportError:
809-
raise ImportError(
810-
"`pandas` package not found, please run `pip install pandas`"
811-
)
812-
813-
response = self.confluence.request(path=link, absolute=True)
814-
text = ""
815-
816-
if (
817-
response.status_code != 200
818-
or response.content == b""
819-
or response.content is None
820-
):
821-
return text
822-
823-
filename = os.path.basename(link)
824-
# Getting the whole content of the url after filename,
825-
# Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
826-
file_extension = os.path.splitext(filename)[1]
827-
828-
if file_extension.startswith(
829-
".csv"
830-
): # if the extension found in the url is ".csv"
831-
content_string = response.content.decode("utf-8")
832-
df = pd.read_csv(io.StringIO(content_string))
833-
text += df.to_string(index=False, header=False) + "\n\n"
834-
else:
835-
workbook = xlrd.open_workbook(file_contents=response.content)
836-
for sheet in workbook.sheets():
837-
text += f"{sheet.name}:\n"
838-
for row in range(sheet.nrows):
839-
for col in range(sheet.ncols):
840-
text += f"{sheet.cell_value(row, col)}\t"
841-
text += "\n"
842-
text += "\n"
843-
844-
return text
845-
846-
def process_svg(
847-
self,
848-
link: str,
849-
ocr_languages: Optional[str] = None,
850-
) -> str:
851-
try:
852-
import pytesseract
853-
from PIL import Image
854-
from reportlab.graphics import renderPM
855-
from svglib.svglib import svg2rlg
856-
except ImportError:
857-
raise ImportError(
858-
"`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
859-
"please run `pip install pytesseract Pillow reportlab svglib`"
860-
)
861-
862-
response = self.confluence.request(path=link, absolute=True)
863-
text = ""
864-
865-
if (
866-
response.status_code != 200
867-
or response.content == b""
868-
or response.content is None
869-
):
870-
return text
871-
872-
drawing = svg2rlg(BytesIO(response.content))
873-
874-
img_data = BytesIO()
875-
renderPM.drawToFile(drawing, img_data, fmt="PNG")
876-
img_data.seek(0)
877-
image = Image.open(img_data)
878-
879-
return pytesseract.image_to_string(image, lang=ocr_languages)

0 commit comments

Comments
 (0)