@@ -160,20 +160,42 @@ def extract_links(self, html_content: str) -> list:
def get_full_links(self, base_url: str, links: list) -> list:
    """
    Convert relative URLs to absolute URLs and filter out non-web links.

    Links using non-HTTP(S) schemes (mailto:, tel:, javascript:, etc.) are
    discarded. When ``self.only_inside_links`` is true, absolute http(s)
    links are also discarded so only site-internal (relative) links survive.

    Args:
        base_url (str): The base URL for resolving relative links.
        links (list): A list of href values to convert.

    Returns:
        list: A list of absolute http:// or https:// URLs.
    """
    # Schemes that can never yield a fetchable web page; checked early to
    # skip pointless urljoin() work. str.startswith accepts a tuple, so one
    # C-level call replaces a per-link any() generator. The final http(s)
    # whitelist check below also catches any scheme not listed here.
    invalid_schemes = (
        'mailto:', 'tel:', 'fax:', 'sms:', 'callto:', 'wtai:', 'javascript:',
        'data:', 'file:', 'ftp:', 'irc:', 'news:', 'nntp:', 'feed:', 'webcal:',
        'skype:', 'im:', 'mtps:', 'spotify:', 'steam:', 'teamspeak:', 'udp:',
        'unreal:', 'ut2004:', 'ventrilo:', 'view-source:', 'ws:', 'wss:'
    )
    web_prefixes = ('http://', 'https://')

    full_links = []
    for link in links:
        # Skip links whose scheme can never be crawled (case-insensitive).
        if link.lower().startswith(invalid_schemes):
            continue

        # Skip absolute (external) links when restricted to inside links.
        if self.only_inside_links and link.startswith(web_prefixes):
            continue

        # Resolve relative URLs against the base URL. urljoin raises
        # ValueError on malformed input — catch only that, not Exception,
        # so genuine bugs still surface.
        try:
            full_link = link if link.startswith(web_prefixes) else urljoin(base_url, link)
        except ValueError as e:
            # Lazy %-formatting: the message is built only if emitted.
            self.logger.warning("Failed to process link %s: %s", link, e)
            continue

        # Whitelist: keep only URLs that resolved to a real web scheme.
        if full_link.startswith(web_prefixes):
            full_links.append(full_link)

    return full_links
178200
179201 def obtain_content (self , documents : List , loader_kwargs ) -> List :
@@ -191,7 +213,11 @@ def obtain_content(self, documents: List, loader_kwargs) -> List:
191213 for doc in documents :
192214 source = doc ["source" ]
193215 if "document" not in doc :
194- document = self .fetch_content (source , loader_kwargs )
216+ try :
217+ document = self .fetch_content (source , loader_kwargs )
218+ except Exception as e :
219+ self .logger .warning (f"Failed to fetch content for { source } : { str (e )} " )
220+ continue
195221
196222 if not document or not document [0 ].page_content .strip ():
197223 self .logger .warning (f"Failed to fetch content for { source } " )
0 commit comments