1212 wait_exponential ,
1313)
1414
15- from langchain_community .document_loaders .base import BaseLoader
15+ from langchain_community .document_loaders .base import BaseLoader , BaseBlobParser
16+ from langchain_community .document_loaders .blob_loaders import Blob
1617
1718logger = logging .getLogger (__name__ )
1819
1920
class SVGParser(BaseBlobParser):
    """Parser for SVG blobs.

    The SVG is rasterized to an in-memory PNG and the text is extracted from
    that image with Tesseract OCR.
    """

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse SVG content from a blob.

        Args:
            blob: Blob containing SVG data to be parsed

        Yields:
            Document: Document with extracted text content. Nothing is
                yielded when the SVG cannot be converted to a drawing.

        Raises:
            ImportError: If `pytesseract`, `Pillow`, `reportlab` or `svglib`
                is not installed.
        """
        try:
            import pytesseract
            from PIL import Image
            from reportlab.graphics import renderPM
            from svglib.svglib import svg2rlg
        except ImportError as e:
            raise ImportError(
                "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
                "please run `pip install pytesseract Pillow reportlab svglib`"
            ) from e

        drawing = svg2rlg(BytesIO(blob.as_bytes()))
        if drawing is None:
            # svglib reports an unparseable document by returning None rather
            # than raising; rendering None would fail with an unrelated error.
            logger.warning("Could not parse SVG content from %s", blob.source)
            return

        # Render to PNG fully in memory, then rewind so PIL reads from the start.
        img_data = BytesIO()
        renderPM.drawToFile(drawing, img_data, fmt="PNG")
        img_data.seek(0)

        with Image.open(img_data) as image:
            text = pytesseract.image_to_string(image)
        yield Document(page_content=text, metadata={"source": blob.source})
50+
51+
class XLSParser(BaseBlobParser):
    """Parser for XLS blobs."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Parse an XLS workbook from a blob into a single text document.

        Each sheet is emitted as its name followed by ``:``, then one line per
        row with every cell value followed by a tab, then a blank line.

        Args:
            blob: Blob containing the binary XLS workbook.

        Yields:
            Document: One document holding the text of all sheets.

        Raises:
            ImportError: If `xlrd` is not installed.
        """
        try:
            import xlrd
        except ImportError as e:
            raise ImportError(
                "`xlrd` package not found, please run `pip install xlrd`"
            ) from e

        workbook = xlrd.open_workbook(file_contents=blob.as_bytes())

        # Collect fragments and join once instead of growing a string with
        # `+=` inside the nested sheet/row/column loops.
        parts = []
        for sheet in workbook.sheets():
            parts.append(f"{sheet.name}:\n")
            for row in range(sheet.nrows):
                parts.extend(
                    f"{sheet.cell_value(row, col)}\t" for col in range(sheet.ncols)
                )
                parts.append("\n")
            parts.append("\n")

        yield Document(
            page_content="".join(parts), metadata={"source": blob.source}
        )
77+
78+
class Doc2TXTParser(BaseBlobParser):
    """Blob parser that extracts plain text from DOCX data via `docx2txt`."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Extract the text of a DOCX blob.

        Args:
            blob: Blob holding the DOCX bytes.

        Yields:
            Document: A single document with the extracted text; its metadata
                carries the blob's source.

        Raises:
            ImportError: If `docx2txt` is not installed.
        """
        try:
            import docx2txt
        except ImportError:
            raise ImportError(
                "`docx2txt` package not found, please run `pip install docx2txt`"
            )

        # docx2txt is handed a file-like object, so wrap the raw bytes.
        docx_stream = BytesIO(blob.as_bytes())
        extracted = docx2txt.process(docx_stream)
        yield Document(page_content=extracted, metadata={"source": blob.source})
93+
94+
def default_parser_factory(attachment_info: dict) -> Optional[BaseBlobParser]:
    """Default parser factory for ConfluenceLoader.

    This function takes the attachment information from Confluence and returns
    a parser for the attachment.

    Args:
        attachment_info: Attachment description; the media type is read from
            ``attachment_info["metadata"]["mediaType"]``.

    Returns:
        A parser for PDF, DOCX, PNG/JPG/JPEG, XLS or SVG attachments, or
        ``None`` when the media type is missing or not supported.
    """
    # A missing or empty "metadata"/"mediaType" means "no parser available",
    # the same outcome as an unsupported type, rather than a KeyError.
    metadata = attachment_info.get("metadata") or {}
    mime_type = metadata.get("mediaType")

    if mime_type == "application/pdf":
        # Imported lazily so the PDF dependency is only needed for PDFs.
        from langchain_community.document_loaders.parsers.pdf import PyMuPDFParser

        return PyMuPDFParser()
    if mime_type == (
        "application/vnd.openxmlformats-officedocument"
        ".wordprocessingml.document"
    ):
        return Doc2TXTParser()
    if mime_type in ("image/png", "image/jpg", "image/jpeg"):
        # Imported lazily so the OCR dependency is only needed for images.
        from langchain_community.document_loaders.parsers.images import (
            TesseractBlobParser,
        )

        return TesseractBlobParser()
    if mime_type == "application/vnd.ms-excel":
        return XLSParser()
    if mime_type == "image/svg+xml":
        return SVGParser()

    return None
127+
128+
20129class ContentFormat (str , Enum ):
21130 """Enumerator of the content formats of Confluence page."""
22131
@@ -123,6 +232,11 @@ class ConfluenceLoader(BaseLoader):
123232 :param attachment_filter_func: A function that takes the attachment information
124233 from Confluence and decides whether or not the
125234 attachment is processed.
235+ :type attachment_filter_func: Callable[[dict], bool], optional
236+ :param attachment_parser_factory: A function that takes the attachment information
237+ from Confluence and returns a parser for the
238+ attachment.
239+ :type attachment_parser_factory: Callable[[dict], Optional[BaseBlobParser]], optional
126240 :param include_comments: defaults to False
127241 :type include_comments: bool, optional
128242 :param content_format: Specify content format, defaults to
@@ -180,6 +294,9 @@ def __init__(
180294 keep_markdown_format : bool = False ,
181295 keep_newlines : bool = False ,
182296 attachment_filter_func : Optional [Callable [[dict ], bool ]] = None ,
297+ attachment_parser_factory : Optional [
298+ Callable [[dict ], Optional [BaseBlobParser ]]
299+ ] = default_parser_factory ,
183300 ):
184301 self .space_key = space_key
185302 self .page_ids = page_ids
@@ -197,6 +314,7 @@ def __init__(
197314 self .keep_markdown_format = keep_markdown_format
198315 self .keep_newlines = keep_newlines
199316 self .attachment_filter_func = attachment_filter_func
317+ self .attachment_parser_factory = attachment_parser_factory
200318
201319 confluence_kwargs = confluence_kwargs or {}
202320 errors = ConfluenceLoader .validate_init_args (
@@ -675,26 +793,32 @@ def process_attachment(
675793 absolute_url = self .base_url + attachment ["_links" ]["download" ]
676794 title = attachment ["title" ]
677795 try :
678- if media_type == "application/pdf" :
679- text = title + self .process_pdf (absolute_url , ocr_languages )
680- elif (
681- media_type == "image/png"
682- or media_type == "image/jpg"
683- or media_type == "image/jpeg"
684- ):
685- text = title + self .process_image (absolute_url , ocr_languages )
686- elif (
687- media_type == "application/vnd.openxmlformats-officedocument"
688- ".wordprocessingml.document"
689- ):
690- text = title + self .process_doc (absolute_url )
691- elif media_type == "application/vnd.ms-excel" :
692- text = title + self .process_xls (absolute_url )
693- elif media_type == "image/svg+xml" :
694- text = title + self .process_svg (absolute_url , ocr_languages )
695- else :
696- continue
697- texts .append (text )
796+ if self .attachment_parser_factory :
797+ parser = self .attachment_parser_factory (attachment )
798+ if parser is None :
799+ continue
800+
801+ response = self .confluence .request (path = absolute_url , absolute = True )
802+
803+ if (
804+ response .status_code != 200
805+ or response .content == b""
806+ or response .content is None
807+ ):
808+ continue
809+
810+ blob = Blob (
811+ data = response .content ,
812+ mimetype = media_type ,
813+ )
814+ text = (
815+ title
816+ + " "
817+ + "\n \n " .join (
818+ [doc .page_content for doc in parser .lazy_parse (blob )]
819+ )
820+ )
821+ texts .append (text )
698822 except requests .HTTPError as e :
699823 if e .response .status_code == 404 :
700824 print (f"Attachment not found at { absolute_url } " ) # noqa: T201
@@ -703,177 +827,3 @@ def process_attachment(
703827 raise
704828
705829 return texts
706-
707- def process_pdf (
708- self ,
709- link : str ,
710- ocr_languages : Optional [str ] = None ,
711- ) -> str :
712- try :
713- import pytesseract
714- from pdf2image import convert_from_bytes
715- except ImportError :
716- raise ImportError (
717- "`pytesseract` or `pdf2image` package not found, "
718- "please run `pip install pytesseract pdf2image`"
719- )
720-
721- response = self .confluence .request (path = link , absolute = True )
722- text = ""
723-
724- if (
725- response .status_code != 200
726- or response .content == b""
727- or response .content is None
728- ):
729- return text
730- try :
731- images = convert_from_bytes (response .content )
732- except ValueError :
733- return text
734-
735- for i , image in enumerate (images ):
736- try :
737- image_text = pytesseract .image_to_string (image , lang = ocr_languages )
738- text += f"Page { i + 1 } :\n { image_text } \n \n "
739- except pytesseract .TesseractError as ex :
740- logger .warning (f"TesseractError: { ex } " )
741-
742- return text
743-
744- def process_image (
745- self ,
746- link : str ,
747- ocr_languages : Optional [str ] = None ,
748- ) -> str :
749- try :
750- import pytesseract
751- from PIL import Image
752- except ImportError :
753- raise ImportError (
754- "`pytesseract` or `Pillow` package not found, "
755- "please run `pip install pytesseract Pillow`"
756- )
757-
758- response = self .confluence .request (path = link , absolute = True )
759- text = ""
760-
761- if (
762- response .status_code != 200
763- or response .content == b""
764- or response .content is None
765- ):
766- return text
767- try :
768- image = Image .open (BytesIO (response .content ))
769- except OSError :
770- return text
771-
772- return pytesseract .image_to_string (image , lang = ocr_languages )
773-
774- def process_doc (self , link : str ) -> str :
775- try :
776- import docx2txt
777- except ImportError :
778- raise ImportError (
779- "`docx2txt` package not found, please run `pip install docx2txt`"
780- )
781-
782- response = self .confluence .request (path = link , absolute = True )
783- text = ""
784-
785- if (
786- response .status_code != 200
787- or response .content == b""
788- or response .content is None
789- ):
790- return text
791- file_data = BytesIO (response .content )
792-
793- return docx2txt .process (file_data )
794-
795- def process_xls (self , link : str ) -> str :
796- import io
797- import os
798-
799- try :
800- import xlrd
801-
802- except ImportError :
803- raise ImportError ("`xlrd` package not found, please run `pip install xlrd`" )
804-
805- try :
806- import pandas as pd
807-
808- except ImportError :
809- raise ImportError (
810- "`pandas` package not found, please run `pip install pandas`"
811- )
812-
813- response = self .confluence .request (path = link , absolute = True )
814- text = ""
815-
816- if (
817- response .status_code != 200
818- or response .content == b""
819- or response .content is None
820- ):
821- return text
822-
823- filename = os .path .basename (link )
824- # Getting the whole content of the url after filename,
825- # Example: ".csv?version=2&modificationDate=1631800010678&cacheVersion=1&api=v2"
826- file_extension = os .path .splitext (filename )[1 ]
827-
828- if file_extension .startswith (
829- ".csv"
830- ): # if the extension found in the url is ".csv"
831- content_string = response .content .decode ("utf-8" )
832- df = pd .read_csv (io .StringIO (content_string ))
833- text += df .to_string (index = False , header = False ) + "\n \n "
834- else :
835- workbook = xlrd .open_workbook (file_contents = response .content )
836- for sheet in workbook .sheets ():
837- text += f"{ sheet .name } :\n "
838- for row in range (sheet .nrows ):
839- for col in range (sheet .ncols ):
840- text += f"{ sheet .cell_value (row , col )} \t "
841- text += "\n "
842- text += "\n "
843-
844- return text
845-
846- def process_svg (
847- self ,
848- link : str ,
849- ocr_languages : Optional [str ] = None ,
850- ) -> str :
851- try :
852- import pytesseract
853- from PIL import Image
854- from reportlab .graphics import renderPM
855- from svglib .svglib import svg2rlg
856- except ImportError :
857- raise ImportError (
858- "`pytesseract`, `Pillow`, `reportlab` or `svglib` package not found, "
859- "please run `pip install pytesseract Pillow reportlab svglib`"
860- )
861-
862- response = self .confluence .request (path = link , absolute = True )
863- text = ""
864-
865- if (
866- response .status_code != 200
867- or response .content == b""
868- or response .content is None
869- ):
870- return text
871-
872- drawing = svg2rlg (BytesIO (response .content ))
873-
874- img_data = BytesIO ()
875- renderPM .drawToFile (drawing , img_data , fmt = "PNG" )
876- img_data .seek (0 )
877- image = Image .open (img_data )
878-
879- return pytesseract .image_to_string (image , lang = ocr_languages )
0 commit comments