11"""
22ParseNode Module
33"""
4- from typing import Tuple , List , Optional
5- from urllib .parse import urljoin
6- import re
4+ from typing import List , Optional
75from semchunk import chunk
86from langchain_community .document_transformers import Html2TextTransformer
97from langchain_core .documents import Document
108from .base_node import BaseNode
11- from ..helpers import default_filters
129
1310class ParseNode (BaseNode ):
1411 """
@@ -43,60 +40,6 @@ def __init__(
         self.parse_html = (
             True if node_config is None else node_config.get("parse_html", True)
         )
-        self.llm_model = node_config['llm_model']
-        self.parse_urls = (
-            False if node_config is None else node_config.get("parse_urls", False)
-        )
-
-    def _clean_urls(self, urls: List[str]) -> List[str]:
-        """
-        Cleans the URLs extracted from the text.
-
-        Args:
-            urls (List[str]): The list of URLs to clean.
-
-        Returns:
-            List[str]: The cleaned URLs.
-        """
-        cleaned_urls = []
-        for url in urls:
-            url = re.sub(r'.*?\]\(', '', url)
-
-            url = url.rstrip(').')
-
-            cleaned_urls.append(url)
-
-        return cleaned_urls
-
-    def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
-        """
-        Extracts URLs from the given text.
-
-        Args:
-            text (str): The text to extract URLs from.
-
-        Returns:
-            Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
-        """
-        if not self.parse_urls:
-            return [], []
-
-        image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.', '')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        all_urls = url_pattern.findall(text)
-        all_urls = self._clean_urls(all_urls)
-
-        if not source.startswith("http"):
-            all_urls = [url for url in all_urls if url.startswith("http")]
-        else:
-            all_urls = [urljoin(source, url) for url in all_urls]
-
-        images = [url for url in all_urls if any(url.endswith(ext) for ext in image_extensions)]
-        links = [url for url in all_urls if url not in images]
-
-        return links, images

     def execute(self, state: dict) -> dict:
         """
@@ -119,46 +62,33 @@ def execute(self, state: dict) -> dict:
         input_keys = self.get_input_keys(state)

         input_data = [state[key] for key in input_keys]
-
         docs_transformed = input_data[0]
-        source = input_data[1] if self.parse_urls else None
-
-        def count_tokens(text):
-            from ..utils import token_count
-            return token_count(text, self.llm_model.model_name)

         if self.parse_html:
-            docs_transformed = Html2TextTransformer(ignore_links=False).transform_documents(input_data[0])
+            docs_transformed = Html2TextTransformer().transform_documents(input_data[0])
             docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunks = chunk(text=docs_transformed.page_content,
                            chunk_size=self.node_config.get("chunk_size", 4096) - 250,
-                           token_counter=count_tokens,
+                           token_counter=lambda text: len(text.split()),
                            memoize=False)
         else:
             docs_transformed = docs_transformed[0]

-            link_urls, img_urls = self.extract_urls(docs_transformed.page_content, source)
-
             chunk_size = self.node_config.get("chunk_size", 4096)
             chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))

             if isinstance(docs_transformed, Document):
                 chunks = chunk(text=docs_transformed.page_content,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)
             else:
                 chunks = chunk(text=docs_transformed,
                                chunk_size=chunk_size,
-                               token_counter=count_tokens,
+                               token_counter=lambda text: len(text.split()),
                                memoize=False)

         state.update({self.output[0]: chunks})
-        if self.parse_urls:
-            state.update({self.output[1]: link_urls})
-            state.update({self.output[2]: img_urls})

         return state
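
The net effect of this commit is that `ParseNode` no longer needs an LLM handle: the model-aware `count_tokens` helper (which called `token_count` with `self.llm_model.model_name`) is replaced by a plain whitespace counter, and the URL/image extraction path is dropped along with its `re`/`urljoin` imports. A minimal sketch of the resulting chunking path, runnable outside the node, assuming only `semchunk` is installed; the sample text and `chunk_size` value are made up for illustration:

```python
from semchunk import chunk

# Hypothetical stand-in for docs_transformed.page_content.
text = "Lorem ipsum dolor sit amet. " * 2_000

# After this commit, sizes are measured in whitespace-delimited words,
# not model tokens, so no llm_model is required.
token_counter = lambda t: len(t.split())

chunk_size = 4096
# Same headroom rule as execute(): reserve space for prompt overhead.
chunk_size = min(chunk_size - 500, int(chunk_size * 0.9))  # -> 3596

chunks = chunk(text=text,
               chunk_size=chunk_size,
               token_counter=token_counter,
               memoize=False)
print(f"{len(chunks)} chunks, largest = {max(map(token_counter, chunks))} words")
```

The trade-off is that a whitespace counter undercounts relative to most BPE tokenizers (a word is often more than one token), so a 4096-word chunk can exceed a 4096-token context window; the `-250`/`-500` headroom only partially compensates.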