11"""
22ParseNode Module
33"""
4- from typing import List , Optional
4+ import re
5+ from typing import List , Optional , Tuple
6+ from urllib .parse import urljoin
57from langchain_community .document_transformers import Html2TextTransformer
68from langchain_core .documents import Document
79from .base_node import BaseNode
810from ..utils .split_text_into_chunks import split_text_into_chunks
11+ from ..helpers import default_filters
912
1013class ParseNode (BaseNode ):
1114 """
@@ -40,6 +43,9 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
+        self.parse_urls = (
+            False if node_config is None else node_config.get("parse_urls", False)
+        )
 
         self.llm_model = node_config.get("llm_model")
         self.chunk_size = node_config.get("chunk_size")
@@ -66,16 +72,22 @@ def execute(self, state: dict) -> dict:
 
         input_data = [state[key] for key in input_keys]
         docs_transformed = input_data[0]
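+        # When parse_urls is enabled, the second input key is expected to carry the page's source URL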
+        source = input_data[1] if self.parse_urls else None
 
         if self.parse_html:
             docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+
             chunks = split_text_into_chunks(text=docs_transformed.page_content,
                                             chunk_size=self.chunk_size - 250, model=self.llm_model)
         else:
             docs_transformed = docs_transformed[0]
 
+            link_urls, img_urls = self._extract_urls(docs_transformed.page_content, source)
+
             chunk_size = self.chunk_size
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))
@@ -89,5 +100,64 @@ def execute(self, state: dict) -> dict:
                                                 model=self.llm_model)
 
         state.update({self.output[0]: chunks})
+        if self.parse_urls:
+            state.update({self.output[1]: link_urls})
+            state.update({self.output[2]: img_urls})
 
         return state
+
+    def _extract_urls(self, text: str, source: Optional[str]) -> Tuple[List[str], List[str]]:
+        """
+        Extracts link and image URLs from the given text.
+
+        Args:
+            text (str): The text to extract URLs from.
+            source (Optional[str]): The URL of the page the text came from, used to
+                resolve relative links; None when URL parsing is disabled.
+
+        Returns:
+            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+        """
+        if not self.parse_urls:
+            return [], []
+
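+        # Match absolute http(s) URLs, plus any bare token ending in a known image extension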
+        image_extensions = default_filters.filter_dict["img_exts"]
+        image_extension_seq = '|'.join(image_extensions).replace('.', '')
+        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+
+        all_urls = url_pattern.findall(text)
+        all_urls = self._clean_urls(all_urls)
+
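+        # Relative paths can only be resolved against an http(s) source; otherwise keep absolute URLs only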
+        if not source.startswith("http"):
+            all_urls = [url for url in all_urls if url.startswith("http")]
+        else:
+            all_urls = [urljoin(source, url) for url in all_urls]
+
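+        # Anything ending in an image extension is an image; everything else is a plain link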
+        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
+        links = [url for url in all_urls if url not in images]
+
+        return links, images
+
+    def _clean_urls(self, urls: List[str]) -> List[str]:
+        """
+        Strips residual markdown link markup (e.g. a leading "[label](") and
+        trailing ")" or "." characters from the extracted URLs.
+
+        Args:
+            urls (List[str]): The list of URLs to clean.
+
+        Returns:
+            List[str]: The cleaned URLs.
+        """
+        cleaned_urls = []
+        for url in urls:
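+            # Drop any markdown "[label](" prefix, then strip trailing ")" and "." characters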
+            url = re.sub(r'.*?\]\(', '', url)
+            url = url.rstrip(').')
+
+            cleaned_urls.append(url)
+
+        return cleaned_urls
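
For reference, a rough standalone sketch of what the new extraction pipeline does to Html2Text-style output. The image-extension list here is a hypothetical stand-in for default_filters.filter_dict["img_exts"], whose real contents live in the package helpers and may differ:

import re
from urllib.parse import urljoin

# Hypothetical stand-in for default_filters.filter_dict["img_exts"]
image_extensions = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".svg"]

# Same pattern as _extract_urls: absolute http(s) URLs, or tokens ending in an image extension
ext_seq = '|'.join(image_extensions).replace('.', '')
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + ext_seq + '))')

text = "See [logo](assets/logo.png) and https://example.com/docs."
source = "https://example.com/"

urls = url_pattern.findall(text)
# Mirror _clean_urls: drop the markdown "[label](" prefix, then trailing ")" or "."
urls = [re.sub(r'.*?\]\(', '', u).rstrip(').') for u in urls]
# source starts with "http", so relative paths are resolved against it
urls = [urljoin(source, u) for u in urls]

images = [u for u in urls if any(u.endswith(ext) for ext in image_extensions)]
links = [u for u in urls if u not in images]

print(links)   # ['https://example.com/docs']
print(images)  # ['https://example.com/assets/logo.png']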