11"""
22ParseNode Module
33"""
4- from typing import Tuple , List , Optional
5- from urllib .parse import urljoin
4+ from typing import List , Optional
65from semchunk import chunk
76from langchain_community .document_transformers import Html2TextTransformer
87from langchain_core .documents import Document
98from .base_node import BaseNode
10- from ..helpers import default_filters
11-
12- import re
139
1410class ParseNode (BaseNode ):
1511 """
@@ -44,67 +40,6 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
-        self.llm_model = node_config['llm_model']
-        self.parse_urls = (
-            False if node_config is None else node_config.get("parse_urls", False)
-        )
-
-    def _clean_urls(self, urls: List[str]) -> List[str]:
-        """
-        Cleans the URLs extracted from the text.
-
-        Args:
-            urls (List[str]): The list of URLs to clean.
-
-        Returns:
-            List[str]: The cleaned URLs.
-        """
-        cleaned_urls = []
-        for url in urls:
-            # Remove any leading 'thumbnail](' or similar patterns
-            url = re.sub(r'.*?\]\(', '', url)
-
-            # Remove any trailing parentheses or brackets
-            url = url.rstrip(').')
-
-            cleaned_urls.append(url)
-
-        return cleaned_urls
-
-    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-        """
-        Extracts URLs from the given text.
-
-        Args:
-            text (str): The text to extract URLs from.
-
-        Returns:
-            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-        """
-        # Return empty lists if the URLs are not to be parsed
-        if not self.parse_urls:
-            return [], []
-
-        # Regular expression to find URLs (both links and images)
-        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.', '')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        # Find all URLs in the string
-        all_urls = url_pattern.findall(text)
-        all_urls = self._clean_urls(all_urls)
-
-        if not source.startswith("http"):
-            # Drop any URLs that are not complete
-            all_urls = [url for url in all_urls if url.startswith("http")]
-        else:
-            # Resolve relative URLs against the source URL
-            all_urls = [urljoin(source, url) for url in all_urls]
-
-        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-        links = [url for url in all_urls if url not in images]
-
-        return links, images
 
     def execute(self, state: dict) -> dict:
         """
@@ -127,46 +62,33 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)

         input_data = [state[key] for key in input_keys]
-
         docs_transformed = input_data[0]
-        source = input_data[1] if self.parse_urls else None
-
-        def count_tokens(text):
-            from ..utils import token_count
-            return token_count(text, self.llm_model.model_name)

         if self.parse_html:
             docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096) - 250,
-                           token_counter=count_tokens,
+                           token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)

         state.update({self.output[0]: chunks})
-        if self.parse_urls:
-            state.update({self.output[1]: link_urls})
-            state.update({self.output[2]: img_urls})

         return state
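
Note: the sketch below is not part of the commit; it illustrates the token-counter change in isolation. It calls semchunk's chunk with the same keyword arguments ParseNode itself uses, swapping the removed model-specific count_tokens helper for the whitespace-splitting lambda introduced above. The sample_text variable and the printed summary are hypothetical, and the 4096 - 250 chunk size mirrors the default used in execute().

from semchunk import chunk

# Hypothetical stand-in for docs_transformed.page_content.
sample_text = "Scraped page content " * 1000

chunks = chunk(
    text=sample_text,
    chunk_size=4096 - 250,  # default chunk_size minus headroom, as in execute()
    token_counter=lambda text: len(text.split()),  # whitespace split replaces the model-specific counter
    memoize=False,
)
print(f"{len(chunks)} chunks produced")

Counting tokens by whitespace is cheaper and model-agnostic, at the cost of over- or under-estimating the true token count for a given LLM tokenizer; the headroom subtracted from chunk_size absorbs some of that error.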