11"""
22ParseNode Module
33"""
4- from typing import List , Optional
4+ from typing import Tuple , List , Optional
5+ from urllib .parse import urljoin
56from semchunk import chunk
67from langchain_community .document_transformers import Html2TextTransformer
78from langchain_core .documents import Document
89from .base_node import BaseNode
10+ from ..helpers import default_filters
11+
12+ import re
913
1014class ParseNode (BaseNode ):
1115 """
@@ -41,6 +45,66 @@ def __init__(
            True if node_config is None else node_config.get("parse_html", True)
        )
        self.llm_model = node_config['llm_model']
+         self.parse_urls = (
+             False if node_config is None else node_config.get("parse_urls", False)
+         )
+
+     def _clean_urls(self, urls: List[str]) -> List[str]:
+         """
+         Cleans the URLs extracted from the text.
+
+         Args:
+             urls (List[str]): The list of URLs to clean.
+
+         Returns:
+             List[str]: The cleaned URLs.
+         """
+         cleaned_urls = []
+         for url in urls:
+             # Remove any leading markdown residue such as 'thumbnail](' before the URL
+             url = re.sub(r'.*?\]\(', '', url)
+
+             # Remove any trailing parentheses or periods
+             url = url.rstrip(').')
+
+             cleaned_urls.append(url)
+
+         return cleaned_urls
+
+     def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
+         """
+         Extracts URLs from the given text.
+
+         Args:
+             text (str): The text to extract URLs from.
+             source (str): The source of the text, used to resolve relative URLs.
+
+         Returns:
+             Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
+         """
+         # Return empty lists if the URLs are not to be parsed
+         if not self.parse_urls:
+             return [], []
+
+         # Regular expression to find URLs (both links and images)
+         image_extensions = default_filters.filter_dict["img_exts"]
+         image_extension_seq = '|'.join(image_extensions).replace('.', '')
+         url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
+
+         # Find all URLs in the string
+         all_urls = url_pattern.findall(text)
+         all_urls = self._clean_urls(all_urls)
+
+         if not source.startswith("http"):
+             # Keep only absolute URLs when the source is not itself a URL
+             all_urls = [url for url in all_urls if url.startswith("http")]
+         else:
+             # Resolve relative URLs against the source URL
+             all_urls = [urljoin(source, url) for url in all_urls]
+
+         images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
+         links = [url for url in all_urls if url not in images]
+
+         return links, images

    def execute(self, state: dict) -> dict:
        """
@@ -63,7 +127,9 @@ def execute(self, state: dict) -> dict:
        input_keys = self.get_input_keys(state)

        input_data = [state[key] for key in input_keys]
+
        docs_transformed = input_data[0]
+         source = input_data[1] if self.parse_urls else None

        def count_tokens(text):
            from ..utils import token_count
@@ -73,12 +139,17 @@ def count_tokens(text):
            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
            docs_transformed = docs_transformed[0]

+             link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
            chunks = chunk(text=docs_transformed.page_content,
                           chunk_size=self.node_config.get("chunk_size", 4096) - 250,
                           token_counter=count_tokens,
                           memoize=False)
        else:
            docs_transformed = docs_transformed[0]
+
+             link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
+
            chunk_size = self.node_config.get("chunk_size", 4096)
            chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

@@ -94,4 +165,8 @@ def count_tokens(text):
                           memoize=False)

        state.update({self.output[0]: chunks})
+         if self.parse_urls:
+             state.update({self.output[1]: link_urls})
+             state.update({self.output[2]: img_urls})
+
        return state
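
For reviewers who want to try the new URL extraction outside the graph, here is a small standalone sketch of the same logic. The sample text, the base URL, and the image_extensions list are made up for illustration; the node itself pulls the extension list from default_filters.filter_dict["img_exts"] and receives source from the state, so treat this as an approximation of the logic rather than a test of the class.

import re
from urllib.parse import urljoin

# Illustrative stand-in for default_filters.filter_dict["img_exts"]
image_extensions = [".jpg", ".jpeg", ".png", ".gif", ".webp"]

text = (
    "Read more at https://example.com/blog/post1. The preview "
    "thumbnail](https://example.com/images/logo.png) is inlined."
)
source = "https://example.com/page"

# Same pattern as extract_urls: absolute URLs, or tokens ending in an image extension
image_extension_seq = '|'.join(image_extensions).replace('.', '')
url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')

all_urls = url_pattern.findall(text)
# Mirror _clean_urls: drop leading "label](" residue and trailing ')' / '.' characters
all_urls = [re.sub(r'.*?\]\(', '', u).rstrip(').') for u in all_urls]
# source starts with "http", so every hit is resolved against it (absolute URLs pass through)
all_urls = [urljoin(source, u) for u in all_urls]

images = [u for u in all_urls if any(u.endswith(ext) for ext in image_extensions)]
links = [u for u in all_urls if u not in images]

print(links)   # ['https://example.com/blog/post1']
print(images)  # ['https://example.com/images/logo.png']

With parse_urls left at its default of False, extract_urls short-circuits and returns empty lists, and the node only writes chunks to the state as before.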