Skip to content

Commit afe7ed4

Browse files
authored
Merge pull request #36 from MODSetter/dev
Fixed current agent citation issues and added sub_section_writer agen…
2 parents fa5dbb7 + aaddd5c commit afe7ed4

File tree

13 files changed

+565
-81
lines changed

13 files changed

+565
-81
lines changed

surfsense_backend/app/agents/__init__.py

Whitespace-only changes.

surfsense_backend/app/agents/researcher/__init__.py

Whitespace-only changes.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
"""New LangGraph Agent.
2+
3+
This module defines a custom graph.
4+
"""
5+
6+
from .graph import graph
7+
8+
__all__ = ["graph"]
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
"""Define the configurable parameters for the agent."""
2+
3+
from __future__ import annotations
4+
5+
from dataclasses import dataclass, fields
6+
from typing import Optional, List
7+
8+
from langchain_core.runnables import RunnableConfig
9+
10+
11+
@dataclass(kw_only=True)
12+
class Configuration:
13+
"""The configuration for the agent."""
14+
15+
# Input parameters provided at invocation
16+
sub_section_title: str
17+
sub_questions: List[str]
18+
connectors_to_search: List[str]
19+
user_id: str
20+
search_space_id: int
21+
top_k: int = 20 # Default top_k value
22+
23+
24+
@classmethod
25+
def from_runnable_config(
26+
cls, config: Optional[RunnableConfig] = None
27+
) -> Configuration:
28+
"""Create a Configuration instance from a RunnableConfig object."""
29+
configurable = (config.get("configurable") or {}) if config else {}
30+
_fields = {f.name for f in fields(cls) if f.init}
31+
return cls(**{k: v for k, v in configurable.items() if k in _fields})
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
from langgraph.graph import StateGraph
from .state import State
from .nodes import fetch_relevant_documents, write_sub_section
from .configuration import Configuration

# Build the two-step sub-section writer graph:
#   __start__ -> fetch_relevant_documents -> write_sub_section -> __end__
workflow = StateGraph(State, config_schema=Configuration)

# Register the two nodes implemented in nodes.py.
workflow.add_node("fetch_relevant_documents", fetch_relevant_documents)
workflow.add_node("write_sub_section", write_sub_section)

# Entry point: gather source material first.
workflow.add_edge("__start__", "fetch_relevant_documents")
# The fetched documents feed the writing step.
workflow.add_edge("fetch_relevant_documents", "write_sub_section")
# Exit point
workflow.add_edge("write_sub_section", "__end__")

# Compile the workflow into an executable graph
graph = workflow.compile()
graph.name = "Sub Section Writer"  # This defines the custom name in LangSmith
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
from .configuration import Configuration
2+
from langchain_core.runnables import RunnableConfig
3+
from .state import State
4+
from typing import Any, Dict
5+
from app.utils.connector_service import ConnectorService
6+
from app.utils.reranker_service import RerankerService
7+
from app.config import config as app_config
8+
from .prompts import citation_system_prompt
9+
from langchain_core.messages import HumanMessage, SystemMessage
10+
11+
# Maps a connector identifier to the ConnectorService coroutine serving it and
# whether that coroutine accepts a ``search_space_id`` argument (the Tavily web
# search is not scoped to a search space in ConnectorService — TODO confirm).
_CONNECTOR_DISPATCH = {
    "YOUTUBE_VIDEO": ("search_youtube", True),
    "EXTENSION": ("search_extension", True),
    "CRAWLED_URL": ("search_crawled_urls", True),
    "FILE": ("search_files", True),
    "TAVILY_API": ("search_tavily", False),
    "SLACK_CONNECTOR": ("search_slack", True),
    "NOTION_CONNECTOR": ("search_notion", True),
}


def _deduplicate_documents(documents):
    """Drop documents whose chunk_id or exact content was already seen.

    A document is kept only if both its ``chunk_id`` (when present) and the
    hash of its ``content`` are new. Order of first occurrence is preserved.
    """
    seen_chunk_ids = set()
    seen_content_hashes = set()
    deduplicated = []
    for doc in documents:
        chunk_id = doc.get("chunk_id")
        content_hash = hash(doc.get("content", ""))
        if (chunk_id and chunk_id in seen_chunk_ids) or content_hash in seen_content_hashes:
            continue
        if chunk_id:
            seen_chunk_ids.add(chunk_id)
        seen_content_hashes.add(content_hash)
        deduplicated.append(doc)
    return deduplicated


def _rerank_documents(documents, rerank_query, reranker_service):
    """Rerank *documents* against *rerank_query*, best score first.

    Returns *documents* unchanged when the list is empty or no reranker is
    configured (mirroring the original fall-through behavior).
    """
    if not documents or not reranker_service:
        return documents

    # Normalize documents into the shape the reranker expects, filling
    # defaults for any missing fields.
    reranker_input_docs = [
        {
            "chunk_id": doc.get("chunk_id", f"chunk_{i}"),
            "content": doc.get("content", ""),
            "score": doc.get("score", 0.0),
            "document": {
                "id": doc.get("document", {}).get("id", ""),
                "title": doc.get("document", {}).get("title", ""),
                "document_type": doc.get("document", {}).get("document_type", ""),
                "metadata": doc.get("document", {}).get("metadata", {}),
            },
        }
        for i, doc in enumerate(documents)
    ]

    reranked = reranker_service.rerank_documents(rerank_query, reranker_input_docs)
    reranked.sort(key=lambda x: x.get("score", 0), reverse=True)
    return reranked


async def fetch_relevant_documents(state: State, config: RunnableConfig) -> Dict[str, Any]:
    """
    Fetch relevant documents for the sub-section using specified connectors.

    This node retrieves documents from various data sources based on the
    sub-questions derived from the sub-section title. It searches across all
    selected connectors (YouTube, Extension, Crawled URLs, Files, Tavily API,
    Slack, Notion), deduplicates the results, and reranks them against the
    sub-section title to provide the most relevant information.

    Returns:
        Dict containing the reranked documents in the
        "relevant_documents_fetched" key.
    """
    configuration = Configuration.from_runnable_config(config)

    # State carries the open DB session; everything else comes from config.
    db_session = state.db_session
    user_id = configuration.user_id
    search_space_id = configuration.search_space_id
    top_k = configuration.top_k

    connector_service = ConnectorService(db_session)
    reranker_service = RerankerService.get_reranker_instance(app_config)

    all_raw_documents = []  # Raw hits from every connector, before dedup/rerank.
    for user_query in configuration.sub_questions:
        # Query reformulation (QueryService.reformulate_query) is currently
        # disabled; each sub-question is used verbatim.
        reformulated_query = user_query

        for connector in configuration.connectors_to_search:
            dispatch = _CONNECTOR_DISPATCH.get(connector)
            if dispatch is None:
                # Unknown connector names are silently skipped, matching the
                # original elif chain's fall-through behavior.
                continue
            method_name, scoped_to_search_space = dispatch
            kwargs = {
                "user_query": reformulated_query,
                "user_id": user_id,
                "top_k": top_k,
            }
            if scoped_to_search_space:
                kwargs["search_space_id"] = search_space_id
            # Each search method returns (summary, chunks); only chunks are kept.
            _, chunks = await getattr(connector_service, method_name)(**kwargs)
            all_raw_documents.extend(chunks)

    # Deduplicate before reranking to avoid scoring the same chunk twice.
    deduplicated_docs = _deduplicate_documents(all_raw_documents)

    # Use the main sub_section_title as the reranking context.
    reranked_docs = _rerank_documents(
        deduplicated_docs, configuration.sub_section_title, reranker_service
    )

    return {
        "relevant_documents_fetched": reranked_docs
    }
161+
162+
163+
164+
async def write_sub_section(state: State, config: RunnableConfig) -> Dict[str, Any]:
    """
    Write the sub-section using the fetched documents.

    This node takes the relevant documents fetched in the previous node and
    uses an LLM to generate a comprehensive answer to the sub-section
    questions with proper citations. The citations follow IEEE format using
    source IDs from the documents.

    Returns:
        Dict containing the final answer in the "final_answer" key.
    """
    documents = state.relevant_documents_fetched

    # Bail out before touching configuration or the LLM when there is nothing
    # to cite — this path needs no external services at all.
    if not documents:
        return {
            "final_answer": "No relevant documents were found to answer this question. Please try refining your search or providing more specific questions."
        }

    configuration = Configuration.from_runnable_config(config)
    llm = app_config.fast_llm_instance

    # Format each document the way the citation system prompt expects:
    # a <document> element carrying a <source_id> plus the raw content.
    formatted_documents = []
    for i, doc in enumerate(documents):
        content = doc.get("content", "")
        doc_info = doc.get("document", {})
        # Use the document's own ID when present; fall back to its 1-based
        # position so every document still has a citable source_id.
        document_id = doc_info.get("id", f"{i+1}")

        formatted_documents.append(
            f"""
<document>
    <metadata>
        <source_id>{document_id}</source_id>
    </metadata>
    <content>
        {content}
    </content>
</document>
"""
        )

    section_title = configuration.sub_section_title
    questions = "\n".join(f"- {q}" for q in configuration.sub_questions)
    documents_text = "\n".join(formatted_documents)

    # Bug fix: the original prompt announced "for the title:" but never
    # actually included the sub-section title — interpolate it here.
    human_message_content = f"""
    Please write a comprehensive answer for the title:
    <section_title>
    {section_title}
    </section_title>

    Address the following questions:
    <questions>
    {questions}
    </questions>

    Use the provided documents as your source material and cite them properly using the IEEE citation format [X] where X is the source_id.
    <documents>
    {documents_text}
    </documents>
    """

    messages = [
        SystemMessage(content=citation_system_prompt),
        HumanMessage(content=human_message_content),
    ]

    # Single LLM round trip; the response content is the finished sub-section.
    response = await llm.ainvoke(messages)

    return {
        "final_answer": response.content
    }
244+
Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
citation_system_prompt = f"""
2+
You are a research assistant tasked with analyzing documents and providing comprehensive answers with proper citations in IEEE format.
3+
4+
<instructions>
5+
1. Carefully analyze all provided documents in the <document> section's.
6+
2. Extract relevant information that addresses the user's query.
7+
3. Synthesize a comprehensive, well-structured answer using information from these documents.
8+
4. For EVERY piece of information you include from the documents, add an IEEE-style citation in square brackets [X] where X is the source_id from the document's metadata.
9+
5. Make sure ALL factual statements from the documents have proper citations.
10+
6. If multiple documents support the same point, include all relevant citations [X], [Y].
11+
7. Present information in a logical, coherent flow.
12+
8. Use your own words to connect ideas, but cite ALL information from the documents.
13+
9. If documents contain conflicting information, acknowledge this and present both perspectives with appropriate citations.
14+
10. Do not make up or include information not found in the provided documents.
15+
11. CRITICAL: You MUST use the exact source_id value from each document's metadata for citations. Do not create your own citation numbers.
16+
12. CRITICAL: Every citation MUST be in the IEEE format [X] where X is the exact source_id value.
17+
13. CRITICAL: Never renumber or reorder citations - always use the original source_id values.
18+
14. CRITICAL: Do not return citations as clickable links.
19+
15. CRITICAL: Never format citations as markdown links like "([1](https://example.com))". Always use plain square brackets only.
20+
16. CRITICAL: Citations must ONLY appear as [X] or [X], [Y], [Z] format - never with parentheses, hyperlinks, or other formatting.
21+
17. CRITICAL: Never make up citation numbers. Only use source_id values that are explicitly provided in the document metadata.
22+
18. CRITICAL: If you are unsure about a source_id, do not include a citation rather than guessing or making one up.
23+
</instructions>
24+
25+
<format>
26+
- Write in clear, professional language suitable for academic or technical audiences
27+
- Organize your response with appropriate paragraphs, headings, and structure
28+
- Every fact from the documents must have an IEEE-style citation in square brackets [X] where X is the EXACT source_id from the document's metadata
29+
- Citations should appear at the end of the sentence containing the information they support
30+
- Multiple citations should be separated by commas: [X], [Y], [Z]
31+
- No need to return references section. Just citation numbers in answer.
32+
- NEVER create your own citation numbering system - use the exact source_id values from the documents.
33+
- NEVER format citations as clickable links or as markdown links like "([1](https://example.com))". Always use plain square brackets only.
34+
- NEVER make up citation numbers if you are unsure about the source_id. It is better to omit the citation than to guess.
35+
</format>
36+
37+
<input_example>
38+
<document>
39+
<metadata>
40+
<source_id>1</source_id>
41+
</metadata>
42+
<content>
43+
The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the coast of Queensland, Australia. It comprises over 2,900 individual reefs and 900 islands.
44+
</content>
45+
</document>
46+
47+
<document>
48+
<metadata>
49+
<source_id>13</source_id>
50+
</metadata>
51+
<content>
52+
Climate change poses a significant threat to coral reefs worldwide. Rising ocean temperatures have led to mass coral bleaching events in the Great Barrier Reef in 2016, 2017, and 2020.
53+
</content>
54+
</document>
55+
56+
<document>
57+
<metadata>
58+
<source_id>21</source_id>
59+
</metadata>
60+
<content>
61+
The Great Barrier Reef was designated a UNESCO World Heritage Site in 1981 due to its outstanding universal value and biological diversity. It is home to over 1,500 species of fish and 400 types of coral.
62+
</content>
63+
</document>
64+
</input_example>
65+
66+
<output_example>
67+
The Great Barrier Reef is the world's largest coral reef system, stretching over 2,300 kilometers along the coast of Queensland, Australia [1]. It was designated a UNESCO World Heritage Site in 1981 due to its outstanding universal value and biological diversity [21]. The reef is home to over 1,500 species of fish and 400 types of coral [21]. Unfortunately, climate change poses a significant threat to coral reefs worldwide, with rising ocean temperatures leading to mass coral bleaching events in the Great Barrier Reef in 2016, 2017, and 2020 [13]. The reef system comprises over 2,900 individual reefs and 900 islands [1], making it an ecological treasure that requires protection from multiple threats [1], [13].
68+
</output_example>
69+
70+
<incorrect_citation_formats>
71+
DO NOT use any of these incorrect citation formats:
72+
- Using parentheses and markdown links: ([1](https://github.com/MODSetter/SurfSense))
73+
- Using parentheses around brackets: ([1])
74+
- Using hyperlinked text: [link to source 1](https://example.com)
75+
- Using footnote style: ... reef system¹
76+
- Making up citation numbers when source_id is unknown
77+
78+
ONLY use plain square brackets [1] or multiple citations [1], [2], [3]
79+
</incorrect_citation_formats>
80+
81+
Note that the citation numbers match exactly with the source_id values (1, 13, and 21) and are not renumbered sequentially. Citations follow IEEE style with square brackets and appear at the end of sentences.
82+
"""

0 commit comments

Comments
 (0)