44
55# Imports from standard library
66from typing import List , Optional
7+ import re
78from tqdm import tqdm
89
910# Imports from Langchain
2021class SearchLinkNode (BaseNode ):
2122 """
2223 A node that can filter out the relevant links in the webpage content for the user prompt.
23- Node expects the aleready scrapped links on the webpage and hence it is expected
24+ Node expects the already scraped links on the webpage and hence it is expected
2425 that this node be used after the FetchNode.
2526
2627 Attributes:
@@ -74,32 +75,6 @@ def execute(self, state: dict) -> dict:
7475 parsed_content_chunks = state [input_keys [1 ]]
7576 output_parser = JsonOutputParser ()
7677
77- prompt_relevant_links = """
78- You are a website scraper and you have just scraped the following content from a website.
79- Content: {content}
80-
81- You are now tasked with identifying all hyper links within the content that are potentially
82- relevant to the user task: {user_prompt}
83-
84- Assume relevance broadly, including any links that might be related or potentially useful
85- in relation to the task.
86-
87- Sort it in order of importance, the first one should be the most important one, the last one
88- the least important
89-
90- Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
91- whether the content at the link is directly relevant.
92-
93- Output only a list of relevant links in the format:
94- [
95- "link1",
96- "link2",
97- "link3",
98- .
99- .
100- .
101- ]
102- """
10378 relevant_links = []
10479
10580 for i , chunk in enumerate (
@@ -109,15 +84,49 @@ def execute(self, state: dict) -> dict:
10984 disable = not self .verbose ,
11085 )
11186 ):
112- merge_prompt = PromptTemplate (
113- template = prompt_relevant_links ,
114- input_variables = ["content" , "user_prompt" ],
115- )
116- merge_chain = merge_prompt | self .llm_model | output_parser
117- # merge_chain = merge_prompt | self.llm_model
118- answer = merge_chain .invoke (
119- {"content" : chunk .page_content , "user_prompt" : user_prompt }
120- )
121- relevant_links += answer
87+ try :
88+ # Primary approach: Regular expression to extract links
89+ links = re .findall (r'(https?://\S+)' , chunk .page_content )
90+ relevant_links += links
91+ except Exception as e :
92+ # Fallback approach: Using the LLM to extract links
93+ self .logger .error (f"Error extracting links: { e } . Falling back to LLM." )
94+ prompt_relevant_links = """
95+ You are a website scraper and you have just scraped the following content from a website.
96+ Content: {content}
97+
98+ You are now tasked with identifying all hyper links within the content that are potentially
99+ relevant to the user task: {user_prompt}
100+
101+ Assume relevance broadly, including any links that might be related or potentially useful
102+ in relation to the task.
103+
104+ Sort it in order of importance, the first one should be the most important one, the last one
105+ the least important
106+
107+ Please list only valid URLs and make sure to err on the side of inclusion if it's uncertain
108+ whether the content at the link is directly relevant.
109+
110+ Output only a list of relevant links in the format:
111+ [
112+ "link1",
113+ "link2",
114+ "link3",
115+ .
116+ .
117+ .
118+ ]
119+ """
120+
121+ merge_prompt = PromptTemplate (
122+ template = prompt_relevant_links ,
123+ input_variables = ["content" , "user_prompt" ],
124+ )
125+ merge_chain = merge_prompt | self .llm_model | output_parser
126+ answer = merge_chain .invoke (
127+ {"content" : chunk .page_content , "user_prompt" : user_prompt }
128+ )
129+ relevant_links += answer
130+
122131 state .update ({self .output [0 ]: relevant_links })
123132 return state
0 commit comments