33"""
44from typing import Tuple , List , Optional
55from urllib .parse import urljoin
6+ import re
67from semchunk import chunk
78from langchain_community .document_transformers import Html2TextTransformer
89from langchain_core .documents import Document
910from .base_node import BaseNode
1011from ..helpers import default_filters
1112
12- import re
13-
1413class ParseNode (BaseNode ):
1514 """
1615 A node responsible for parsing HTML content from a document.
@@ -61,14 +60,12 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
6160 """
6261 cleaned_urls = []
6362 for url in urls :
64- # Remove any leading 'thumbnail](' or similar patterns
6563 url = re .sub (r'.*?\]\(' , '' , url )
66-
67- # Remove any trailing parentheses or brackets
64+
6865 url = url .rstrip (').' )
69-
66+
7067 cleaned_urls .append (url )
71-
68+
7269 return cleaned_urls
7370
7471 def extract_urls (self , text : str , source : str ) -> Tuple [List [str ], List [str ]]:
@@ -81,26 +78,21 @@ def extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
8178 Returns:
8279 Tuple[List[str], List[str]]: A tuple containing the extracted link URLs and image URLs.
8380 """
84- # Return empty lists if the URLs are not to be parsed
8581 if not self .parse_urls :
8682 return [], []
87-
88- # Regular expression to find URLs (both links and images)
83+
8984 image_extensions = default_filters .filter_dict ["img_exts" ]
9085 image_extension_seq = '|' .join (image_extensions ).replace ('.' ,'' )
9186 url_pattern = re .compile (r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))' )
9287
93- # Find all URLs in the string
9488 all_urls = url_pattern .findall (text )
9589 all_urls = self ._clean_urls (all_urls )
9690
9791 if not source .startswith ("http" ):
98- # Remove any URLs that is not complete
9992 all_urls = [url for url in all_urls if url .startswith ("http" )]
10093 else :
101- # Add to local URLs the source URL
10294 all_urls = [urljoin (source , url ) for url in all_urls ]
103-
95+
10496 images = [url for url in all_urls if any (url .endswith (ext ) for ext in image_extensions )]
10597 links = [url for url in all_urls if url not in images ]
10698
@@ -136,7 +128,7 @@ def count_tokens(text):
136128 return token_count (text , self .llm_model .model_name )
137129
138130 if self .parse_html :
139- docs_transformed = Html2TextTransformer ().transform_documents (input_data [0 ])
131+ docs_transformed = Html2TextTransformer (ignore_links = False ).transform_documents (input_data [0 ])
140132 docs_transformed = docs_transformed [0 ]
141133
142134 link_urls , img_urls = self .extract_urls (docs_transformed .page_content , source )
0 commit comments