1212 wait_exponential ,
1313)
1414
15- from langchain_community .document_loaders .base import BaseLoader
15+ from langchain_community .document_loaders .base import BaseBlobParser , BaseLoader
16+ from langchain_community .document_loaders .blob_loaders import Blob
1617
1718logger = logging .getLogger (__name__ )
1819
1920
class SVGParser(BaseBlobParser):
    """Parser for SVG blobs.

    The SVG is rasterized to a PNG with ``svglib``/``reportlab`` and the
    resulting image is passed to Tesseract OCR to extract its text.
    """

    def __init__(self, ocr_languages: Optional[str] = None) -> None:
        """Initialize the parser.

        Args:
            ocr_languages: Language code(s) forwarded as ``lang`` to
                ``pytesseract.image_to_string``. ``None`` (the default) keeps
                pytesseract's own default language.
        """
        super().__init__()
        self.ocr_languages = ocr_languages

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse SVG content from a blob.

        Args:
            blob: Blob containing SVG data to be parsed

        Yields:
            Document: Document with extracted text content. Nothing is
                yielded when the SVG cannot be loaded.

        Raises:
            ImportError: If any of the optional OCR/rendering packages is
                missing.
        """
        try:
            import pytesseract
            from PIL import Image
            from reportlab.graphics import renderPM
            from svglib.svglib import svg2rlg
        except ImportError as err:
            raise ImportError(
                "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
                "please run `pip install pytesseract Pillow reportlab svglib`"
            ) from err

        drawing = svg2rlg(BytesIO(blob.as_bytes()))
        if drawing is None:
            # svg2rlg signals an unreadable SVG by returning None; rendering
            # None would fail with an unrelated AttributeError.
            logger.warning("Could not load SVG from %s; skipping", blob.source)
            return

        png_buffer = BytesIO()
        renderPM.drawToFile(drawing, png_buffer, fmt="PNG")
        # Rewind so PIL reads the PNG from its first byte.
        png_buffer.seek(0)
        with Image.open(png_buffer) as image:
            text = pytesseract.image_to_string(image, lang=self.ocr_languages)
        yield Document(page_content=text, metadata={"source": blob.source})
50+
51+
class XLSParser(BaseBlobParser):
    """Parser for XLS blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse an XLS workbook from a blob into a single text document.

        Every sheet is emitted as its name followed by ``:``, then one line
        per row with each cell value followed by a tab, then a blank line.

        Args:
            blob: Blob containing the XLS workbook bytes.

        Yields:
            Document: One document holding the text of all sheets.

        Raises:
            ImportError: If ``xlrd`` is not installed.
        """
        try:
            import xlrd
        except ImportError as err:
            raise ImportError(
                "`xlrd` package not found, please run `pip install xlrd`"
            ) from err

        workbook = xlrd.open_workbook(file_contents=blob.as_bytes())

        # Collect fragments and join once instead of repeated `+=`, which is
        # quadratic in the size of the workbook.
        fragments = []
        for sheet in workbook.sheets():
            fragments.append(f"{sheet.name}:\n")
            for row in range(sheet.nrows):
                fragments.extend(
                    f"{sheet.cell_value(row, col)}\t" for col in range(sheet.ncols)
                )
                fragments.append("\n")
            fragments.append("\n")

        yield Document(
            page_content="".join(fragments), metadata={"source": blob.source}
        )
74+
75+
class Doc2TXTParser(BaseBlobParser):
    """Parser for DOCX blobs, backed by ``docx2txt``."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the text of a DOCX blob.

        Args:
            blob: Blob containing the DOCX bytes.

        Yields:
            Document: A single document whose content is the output of
                ``docx2txt.process`` and whose metadata carries the blob
                source.

        Raises:
            ImportError: If ``docx2txt`` is not installed.
        """
        try:
            import docx2txt
        except ImportError:
            raise ImportError(
                "`docx2txt` package not found, please run `pip install docx2txt`"
            )

        # docx2txt is handed a file-like object, so wrap the raw bytes.
        docx_stream = BytesIO(blob.as_bytes())
        extracted_text = docx2txt.process(docx_stream)
        source_metadata = {"source": blob.source}
        yield Document(page_content=extracted_text, metadata=source_metadata)
90+
91+
def default_parser_factory(attachment_info: dict) -> Optional[BaseBlobParser]:
    """Default parser factory for ConfluenceLoader.

    This function takes the attachment information from Confluence and returns
    a parser for the attachment.

    Args:
        attachment_info: Attachment description; the media type is read from
            ``attachment_info["metadata"]["mediaType"]``.

    Returns:
        A parser for the attachment's media type, or ``None`` when the media
        type is missing or no parser is registered for it.
    """
    # Missing or null metadata means "no parser" rather than an error.
    metadata = attachment_info.get("metadata") or {}
    raw_mime_type = metadata.get("mediaType")
    if not isinstance(raw_mime_type, str):
        return None

    # Media types are case-insensitive and may carry parameters such as
    # "; charset=utf-8"; compare on the bare, lower-cased type/subtype.
    mime_type = raw_mime_type.split(";", 1)[0].strip().lower()

    # Parser modules are imported lazily so their optional dependencies are
    # only needed when the matching attachment type is actually seen.
    if mime_type == "application/pdf":
        from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

        return PyMuPDFParser()
    if mime_type == (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    ):
        return Doc2TXTParser()
    if mime_type in ("image/png", "image/jpg", "image/jpeg"):
        from langchain_community.document_loaders.parsers.images import (
            TesseractBlobParser,
        )

        return TesseractBlobParser()
    if mime_type == "application/vnd.ms-excel":
        return XLSParser()
    if mime_type == "image/svg+xml":
        return SVGParser()

    return None
124+
125+
20126class ContentFormat (str , Enum ):
21127 """Enumerator of the content formats of Confluence page."""
22128
@@ -123,6 +229,12 @@ class ConfluenceLoader(BaseLoader):
123229 :param attachment_filter_func: A function that takes the attachment information
124230 from Confluence and decides whether or not the
125231 attachment is processed.
232+ :type attachment_filter_func: Callable[[dict], bool], optional
233+ :param attachment_parser_factory: A function that takes the attachment information
234+ from Confluence and returns a parser for the
235+ attachment.
236+ :type attachment_parser_factory:
237+ Callable[[dict], Optional[BaseBlobParser]], optional
126238 :param include_comments: defaults to False
127239 :type include_comments: bool, optional
128240 :param content_format: Specify content format, defaults to
@@ -180,6 +292,9 @@ def __init__(
180292 keep_markdown_format : bool = False ,
181293 keep_newlines : bool = False ,
182294 attachment_filter_func : Optional [Callable [[dict ], bool ]] = None ,
295+ attachment_parser_factory : Optional [
296+ Callable [[dict ], Optional [BaseBlobParser ]]
297+ ] = default_parser_factory ,
183298 ):
184299 self .space_key = space_key
185300 self .page_ids = page_ids
@@ -197,6 +312,7 @@ def __init__(
197312 self .keep_markdown_format = keep_markdown_format
198313 self .keep_newlines = keep_newlines
199314 self .attachment_filter_func = attachment_filter_func
315+ self .attachment_parser_factory = attachment_parser_factory
200316
201317 confluence_kwargs = confluence_kwargs or {}
202318 errors = ConfluenceLoader .validate_init_args (
@@ -675,26 +791,32 @@ def process_attachment(
675791 absolute_url = self .base_url + attachment ["_links" ]["download" ]
676792 title = attachment ["title" ]
677793 try :
678- if media_type == "application/pdf" :
679- text = title + self .process_pdf (absolute_url , ocr_languages )
680- elif (
681- media_type == "image/png"
682- or media_type == "image/jpg"
683- or media_type == "image/jpeg"
684- ):
685- text = title + self .process_image (absolute_url , ocr_languages )
686- elif (
687- media_type == "application/vnd.openxmlformats-officedocument"
688- ".wordprocessingml.document"
689- ):
690- text = title + self .process_doc (absolute_url )
691- elif media_type == "application/vnd.ms-excel" :
692- text = title + self .process_xls (absolute_url )
693- elif media_type == "image/svg+xml" :
694- text = title + self .process_svg (absolute_url , ocr_languages )
695- else :
696- continue
697- texts .append (text )
794+ if self .attachment_parser_factory :
795+ parser = self .attachment_parser_factory (attachment )
796+ if parser is None :
797+ continue
798+
799+ response = self .confluence .request (path = absolute_url , absolute = True )
800+
801+ if (
802+ response .status_code != 200
803+ or response .content == b""
804+ or response .content is None
805+ ):
806+ continue
807+
808+ blob = Blob (
809+ data = response .content ,
810+ mimetype = media_type ,
811+ )
812+ text = (
813+ title
814+ + " "
815+ + "\n \n " .join (
816+ [doc .page_content for doc in parser .lazy_parse (blob )]
817+ )
818+ )
819+ texts .append (text )
698820 except requests .HTTPError as e :
699821 if e .response .status_code == 404 :
700822 print (f"Attachment not found at { absolute_url } " ) # noqa: T201
@@ -703,177 +825,3 @@ def process_attachment(
703825 raise
704826
705827 return texts
706-
707- def process_pdf (
708- self ,
709- link : str ,
710- ocr_languages : Optional [str ] = None ,
711- ) -> str :
712- try :
713- import pytesseract
714- from pdf2image import convert_from_bytes
715- except ImportError :
716- raise ImportError (
717- "`pytesseract` or `pdf2image` package not found, "
718- "please run `pip install pytesseract pdf2image`"
719- )
720-
721- response = self .confluence .request (path = link , absolute = True )
722- text = ""
723-
724- if (
725- response .status_code != 200
726- or response .content == b""
727- or response .content is None
728- ):
729- return text
730- try :
731- images = convert_from_bytes (response .content )
732- except ValueError :
733- return text
734-
735- for i , image in enumerate (images ):
736- try :
737- image_text = pytesseract .image_to_string (image , lang = ocr_languages )
738- text += f"Page { i + 1 } :\n { image_text } \n \n "
739- except pytesseract .TesseractError as ex :
740- logger .warning (f"TesseractError: { ex } " )
741-
742- return text
743-
744- def process_image (
745- self ,
746- link : str ,
747- ocr_languages : Optional [str ] = None ,
748- ) -> str :
749- try :
750- import pytesseract
751- from PIL import Image
752- except ImportError :
753- raise ImportError (
754- "`pytesseract` or `Pillow` package not found, "
755- "please run `pip install pytesseract Pillow`"
756- )
757-
758- response = self .confluence .request (path = link , absolute = True )
759- text = ""
760-
761- if (
762- response .status_code != 200
763- or response .content == b""
764- or response .content is None
765- ):
766- return text
767- try :
768- image = Image .open (BytesIO (response .content ))
769- except OSError :
770- return text
771-
772- return pytesseract .image_to_string (image , lang = ocr_languages )
773-
774- def process_doc (self , link : str ) -> str :
775- try :
776- import docx2txt
777- except ImportError :
778- raise ImportError (
779- "`docx2txt` package not found, please run `pip install docx2txt`"
780- )
781-
782- response = self .confluence .request (path = link , absolute = True )
783- text = ""
784-
785- if (
786- response .status_code != 200
787- or response .content == b""
788- or response .content is None
789- ):
790- return text
791- file_data = BytesIO (response .content )
792-
793- return docx2txt .process (file_data )
794-
795- def process_xls (self , link : str ) -> str :
796- import io
797- import os
798-
799- try :
800- import xlrd
801-
802- except ImportError :
803- raise ImportError ("`xlrd` package not found, please run `pip install xlrd`" )
804-
805- try :
806- import pandas as pd
807-
808- except ImportError :
809- raise ImportError (
810- "`pandas` package not found, please run `pip install pandas`"
811- )
812-
813- response = self .confluence .request (path = link , absolute = True )
814- text = ""
815-
816- if (
817- response .status_code != 200
818- or response .content == b""
819- or response .content is None
820- ):
821- return text
822-
823- filename = os .path .basename (link )
824- # Getting the whole content of the url after filename,
825- # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
826- file_extension = os .path .splitext (filename )[1 ]
827-
828- if file_extension .startswith (
829- ".csv"
830- ): # if the extension found in the url is ".csv"
831- content_string = response .content .decode ("utf-8" )
832- df = pd .read_csv (io .StringIO (content_string ))
833- text += df .to_string (index = False , header = False ) + "\n \n "
834- else :
835- workbook = xlrd .open_workbook (file_contents = response .content )
836- for sheet in workbook .sheets ():
837- text += f"{ sheet .name } :\n "
838- for row in range (sheet .nrows ):
839- for col in range (sheet .ncols ):
840- text += f"{ sheet .cell_value (row , col )} \t "
841- text += "\n "
842- text += "\n "
843-
844- return text
845-
846- def process_svg (
847- self ,
848- link : str ,
849- ocr_languages : Optional [str ] = None ,
850- ) -> str :
851- try :
852- import pytesseract
853- from PIL import Image
854- from reportlab .graphics import renderPM
855- from svglib .svglib import svg2rlg
856- except ImportError :
857- raise ImportError (
858- "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
859- "please run `pip install pytesseract Pillow reportlab svglib`"
860- )
861-
862- response = self .confluence .request (path = link , absolute = True )
863- text = ""
864-
865- if (
866- response .status_code != 200
867- or response .content == b""
868- or response .content is None
869- ):
870- return text
871-
872- drawing = svg2rlg (BytesIO (response .content ))
873-
874- img_data = BytesIO ()
875- renderPM .drawToFile (drawing , img_data , fmt = "PNG" )
876- img_data .seek (0 )
877- image = Image .open (img_data )
878-
879- return pytesseract .image_to_string (image , lang = ocr_languages )
0 commit comments