 from typing import List, Optional
 import re
 from tqdm import tqdm
+from urllib.parse import urlparse, parse_qs
 from langchain.prompts import PromptTemplate
 from langchain_core.output_parsers import JsonOutputParser
 from langchain_core.runnables import RunnableParallel
 from ..utils.logging import get_logger
 from .base_node import BaseNode
 from ..prompts import TEMPLATE_RELEVANT_LINKS
+from ..helpers import default_filters


 class SearchLinkNode(BaseNode):
@@ -39,10 +41,54 @@ def __init__(
         super().__init__(node_name, "node", input, output, 1, node_config)

         self.llm_model = node_config["llm_model"]
-        self.verbose = (
-            False if node_config is None else node_config.get("verbose", False)
-        )

+        # Apply filters if filter_links is True or if filter_config is provided
+        if node_config.get("filter_links", False) or "filter_config" in node_config:
+            # Merge provided filter config with default filter config for partial configuration
+            provided_filter_config = node_config.get("filter_config", {})
+            self.filter_config = {**default_filters.filter_dict, **provided_filter_config}
+            self.filter_links = True
+        else:
+            # Skip filtering if not enabled
+            self.filter_config = None
+            self.filter_links = False
+
+        self.verbose = node_config.get("verbose", False)
+        self.seen_links = set()
+    def _is_same_domain(self, url, domain):
+        if not self.filter_links or not self.filter_config.get("diff_domain_filter", True):
+            return True  # Skip the domain filter if not enabled
+        parsed_url = urlparse(url)
+        parsed_domain = urlparse(domain)
+        return parsed_url.netloc == parsed_domain.netloc
+
+    def _is_image_url(self, url):
+        if not self.filter_links:
+            return False  # Skip image filtering if filtering is not enabled
+
+        image_extensions = self.filter_config.get("img_exts", [])
+        return any(url.lower().endswith(ext) for ext in image_extensions)
+
+    def _is_language_url(self, url):
+        if not self.filter_links:
+            return False  # Skip language filtering if filtering is not enabled
+
+        lang_indicators = self.filter_config.get("lang_indicators", [])
+        parsed_url = urlparse(url)
+        query_params = parse_qs(parsed_url.query)
+
+        # Check if the URL path or query string indicates a language-specific version
+        return any(
+            indicator in parsed_url.path.lower() or indicator in query_params
+            for indicator in lang_indicators
+        )
+
+    def _is_potentially_irrelevant(self, url):
+        if not self.filter_links:
+            return False  # Skip irrelevant URL filtering if filtering is not enabled
+
+        irrelevant_keywords = self.filter_config.get("irrelevant_keywords", [])
+        return any(keyword in url.lower() for keyword in irrelevant_keywords)
+
     def execute(self, state: dict) -> dict:
         """
         Filter out the links from the webpage that are relevant to the prompt. Out of the filtered links, also
@@ -64,6 +110,7 @@ def execute(self, state: dict) -> dict:
         parsed_content_chunks = state.get("doc")
+        source_url = state.get("url") or state.get("local_dir")
         output_parser = JsonOutputParser()

         relevant_links = []
@@ -76,10 +123,28 @@ def execute(self, state: dict) -> dict:
             )
         ):
             try:
+
                 # Primary approach: Regular expression to extract links
                 links = re.findall(r'https?://[^\s"<>\]]+', str(chunk.page_content))

-                relevant_links += links
+                if not self.filter_links:
+                    links = list(set(links))
+
+                    relevant_links += links
+                    self.seen_links.update(relevant_links)
+                else:
+                    # Apply the domain, image, language, and relevance filters and skip already-seen links
+                    filtered_links = [
+                        link for link in links
+                        if self._is_same_domain(link, source_url)
+                        and not self._is_image_url(link)
+                        and not self._is_language_url(link)
+                        and not self._is_potentially_irrelevant(link)
+                        and link not in self.seen_links
+                    ]
+                    filtered_links = list(set(filtered_links))
+                    relevant_links += filtered_links
+                    self.seen_links.update(relevant_links)
+
             except Exception as e:
                 # Fallback approach: Using the LLM to extract links
                 self.logger.error(f"Error extracting links: {e}. Falling back to LLM.")
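As a quick illustration of the new filtering behaviour, here is a minimal standalone sketch of how the merged `filter_config` keys used above (`diff_domain_filter`, `img_exts`, `lang_indicators`, `irrelevant_keywords`) prune a list of links. The default values and the `keep_link` helper are made up for this example; they are not the actual contents of `default_filters.filter_dict` and not part of the node's API.

```python
from urllib.parse import urlparse, parse_qs

# Illustrative defaults only; the real values live in default_filters.filter_dict.
filter_config = {
    "diff_domain_filter": True,
    "img_exts": [".jpg", ".png", ".svg"],
    "lang_indicators": ["lang=", "/fr/", "/de/"],
    "irrelevant_keywords": ["login", "signup", "privacy"],
}

def keep_link(link, source_url, seen_links):
    """Mirror the new _is_* checks: same domain, not an image, not a language
    variant, not an obviously irrelevant page, and not seen before."""
    parsed = urlparse(link)
    if filter_config["diff_domain_filter"] and parsed.netloc != urlparse(source_url).netloc:
        return False
    if any(link.lower().endswith(ext) for ext in filter_config["img_exts"]):
        return False
    query_params = parse_qs(parsed.query)
    if any(ind in parsed.path.lower() or ind in query_params
           for ind in filter_config["lang_indicators"]):
        return False
    if any(kw in link.lower() for kw in filter_config["irrelevant_keywords"]):
        return False
    return link not in seen_links

seen_links = set()
candidates = [
    "https://example.com/docs/getting-started",
    "https://example.com/assets/logo.png",
    "https://cdn.other.org/docs",
    "https://example.com/login",
]
kept = [link for link in candidates if keep_link(link, "https://example.com", seen_links)]
seen_links.update(kept)
print(kept)  # -> ['https://example.com/docs/getting-started']
```

Inside `execute`, the node applies the same checks per chunk and keeps updating `seen_links` across chunks, so a link that already passed the filters once is not emitted again.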