@@ -27,6 +27,8 @@ class ParseNode(BaseNode):
             node_config (dict): Additional configuration for the node.
             node_name (str): The unique identifier name for the node, defaulting to "Parse".
         """
+    url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
+    relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

     def __init__(
         self,
@@ -123,12 +125,26 @@ def _extract_urls(self, text: str, source: str) -> Tuple[List[str], List[str]]:
             return [], []

         image_extensions = default_filters.filter_dict["img_exts"]
-        image_extension_seq = '|'.join(image_extensions).replace('.','')
-        url_pattern = re.compile(r'(https?://[^\s]+|\S+\.(?:' + image_extension_seq + '))')
-
-        all_urls = url_pattern.findall(text)
+        url = ""
+        all_urls = set()
+
+        for group in ParseNode.url_pattern.findall(text):
+            for el in group:
+                if el != '':
+                    url += el
+            all_urls.add(url)
+            url = ""
+
+        url = ""
+        for group in ParseNode.relative_url_pattern.findall(text):
+            for el in group:
+                if el not in ['', '[', ']', '(', ')', '{', '}']:
+                    url += el
+            all_urls.add(urljoin(source, url))
+            url = ""
+
+        all_urls = list(all_urls)
         all_urls = self._clean_urls(all_urls)
-
         if not source.startswith("http"):
             all_urls = [url for url in all_urls if url.startswith("http")]
         else:
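
As a rough illustration of what the new extraction loop does (a standalone sketch, not part of the diff; the sample text and source URL below are made up), the two class-level patterns can be exercised directly:

# Illustrative sketch only -- sample and source are invented values.
import re
from urllib.parse import urljoin

url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")
relative_url_pattern = re.compile(r"[\(](/[^\(\)\s]*)")

sample = "See [docs](https://www.example.com/guide) and [about](/about/team)."
source = "https://www.example.com"

# url_pattern.findall yields (www_prefix, host_and_path) tuples; joining the
# non-empty groups rebuilds each absolute-link candidate (the scheme itself is
# not captured, and stray punctuation such as a closing ')' can come along).
absolute = {"".join(g for g in groups if g) for groups in url_pattern.findall(sample)}

# relative_url_pattern captures the "/..." path that follows a "(" in a
# markdown link; each path is resolved against the page the text came from.
relative = {urljoin(source, path) for path in relative_url_pattern.findall(sample)}

print(sorted(absolute | relative))
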
@@ -151,9 +167,32 @@ def _clean_urls(self, urls: List[str]) -> List[str]:
151167 """
152168 cleaned_urls = []
153169 for url in urls :
154- url = re .sub (r'.*?\]\(' , '' , url )
155- url = url .rstrip (').' )
170+ if not ParseNode ._is_valid_url (url ):
171+ url = re .sub (r'.*?\]\(' , '' , url )
172+ url = re .sub (r'.*?\[\(' , '' , url )
173+ url = re .sub (r'.*?\[\)' , '' , url )
174+ url = re .sub (r'.*?\]\)' , '' , url )
175+ url = re .sub (r'.*?\)\[' , '' , url )
176+ url = re .sub (r'.*?\)\[' , '' , url )
177+ url = re .sub (r'.*?\(\]' , '' , url )
178+ url = re .sub (r'.*?\)\]' , '' , url )
179+ url = url .rstrip (').-' )
180+ if len (url ) > 0 :
181+ cleaned_urls .append (url )
182+
183+ return cleaned_urls
184+
185+ @staticmethod
186+ def _is_valid_url (url : str ) -> bool :
187+ """
+        Checks if the URL format is valid.

-            cleaned_urls.append(url)
+        Args:
+            url (str): The URL to check.

-        return cleaned_urls
+        Returns:
+            bool: True if the URL format is valid, False otherwise
+        """
+        if re.fullmatch(ParseNode.url_pattern, url) is not None:
+            return True
+        return False
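
A quick standalone check of the new validity gate (again a sketch, not part of the diff; the candidate strings are invented): a candidate that already full-matches url_pattern is kept as-is, while a markdown-wrapped one falls through to the cleanup substitutions:

# Illustrative sketch only -- candidates are invented values.
import re

url_pattern = re.compile(r"[http[s]?:\/\/]?(www\.)?([-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b[-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)")

candidates = ["www.example.com/guide", "[docs](www.example.com/guide)"]
for candidate in candidates:
    # Mirrors _is_valid_url: a full-pattern match means no cleanup is applied.
    if re.fullmatch(url_pattern, candidate) is None:
        # Mirrors part of _clean_urls: drop leading "](" markdown residue and
        # trailing ")", ".", "-" characters.
        candidate = re.sub(r'.*?\]\(', '', candidate)
        candidate = candidate.rstrip(').-')
    print(candidate)
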