Skip to content

Commit 2f8f9d8

Browse files
Modified retriever query (#491)
1 parent e7d2bab commit 2f8f9d8

File tree

2 files changed

+82
-19
lines changed

2 files changed

+82
-19
lines changed

backend/src/QA_integration_new.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -90,9 +90,13 @@ def create_neo4j_chat_message_history(graph, session_id):
9090
return None
9191

9292
def format_documents(documents,model):
93+
prompt_token_cutoff = 4
94+
for models,value in CHAT_TOKEN_CUT_OFF.items():
95+
if model in models:
96+
prompt_token_cutoff = value
9397

9498
sorted_documents = sorted(documents, key=lambda doc: doc.state["query_similarity_score"], reverse=True)
95-
sorted_documents = sorted_documents[:CHAT_TOKEN_CUT_OFF[model]]
99+
sorted_documents = sorted_documents[:prompt_token_cutoff]
96100

97101
formatted_docs = []
98102
sources = set()
@@ -227,6 +231,7 @@ def summarize_and_log(history, messages, llm):
227231

228232
def QA_RAG(graph, model, question, session_id, mode):
229233
try:
234+
logging.info(f"Chat Mode : {mode}")
230235
if mode == "vector":
231236
retrieval_query = VECTOR_SEARCH_QUERY
232237
elif mode == "graph":

backend/src/shared/constants.py

Lines changed: 76 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,16 @@
1616

1717
## CHAT SETUP
1818
CHAT_MAX_TOKENS = 1000
19-
CHAT_SEARCH_KWARG_K = 5
19+
CHAT_SEARCH_KWARG_K = 3
2020
CHAT_SEARCH_KWARG_SCORE_THRESHOLD = 0.7
2121
CHAT_DOC_SPLIT_SIZE = 3000
22-
CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.15
22+
CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.10
2323
CHAT_TOKEN_CUT_OFF = {
24-
"gpt-3.5": 5,
25-
"gemini-1.0-pro": 5,
26-
"gemini-1.5-pro": 10,
27-
"gpt-4": 10,
28-
"diffbot" : 10,
29-
"gpt-4o": 10,
30-
"groq-llama3" : 5
24+
("gpt-3.5","gemini-1.0-pro","gemini-1.5-pro","groq-llama3" ) : 4,
25+
("gpt-4","diffbot" , "gpt-4o") : 28
3126
}
27+
28+
3229
### CHAT TEMPLATES
3330
CHAT_SYSTEM_TEMPLATE = """
3431
You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources.
@@ -90,26 +87,87 @@
9087
{source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} as metadata
9188
"""
9289

93-
VECTOR_GRAPH_SEARCH_QUERY="""
90+
# VECTOR_GRAPH_SEARCH_QUERY="""
91+
# WITH node as chunk, score
92+
# MATCH (chunk)-[:PART_OF]->(d:Document)
93+
# CALL { WITH chunk
94+
# MATCH (chunk)-[:HAS_ENTITY]->(e)
95+
# MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
96+
# UNWIND rels as r
97+
# RETURN collect(distinct r) as rels
98+
# }
99+
# WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels
100+
# WITH d, avg_score,
101+
# [c IN chunks | c.chunk.text] AS texts,
102+
# [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
103+
# [r in rels | coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ startNode(r).id + " "+ type(r) + " " + coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id] as entities
104+
# WITH d, avg_score,chunkdetails,
105+
# apoc.text.join(texts,"\n----\n") +
106+
# apoc.text.join(entities,"\n")
107+
# as text
108+
# RETURN text, avg_score AS score, {source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
109+
# """
110+
111+
112+
VECTOR_GRAPH_SEARCH_QUERY = """
94113
WITH node as chunk, score
114+
// find the document of the chunk
95115
MATCH (chunk)-[:PART_OF]->(d:Document)
116+
// fetch entities
96117
CALL { WITH chunk
118+
// entities connected to the chunk
119+
// todo only return entities that are actually in the chunk, remember we connect all extracted entities to all chunks
97120
MATCH (chunk)-[:HAS_ENTITY]->(e)
98-
MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document)
99-
UNWIND rels as r
100-
RETURN collect(distinct r) as rels
121+
122+
// depending on match to query embedding either 1 or 2 step expansion
123+
WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding ) <= 0.95
124+
THEN
125+
collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path }
126+
ELSE
127+
collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path }
128+
END as paths
129+
130+
RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels,
131+
collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes
101132
}
102-
WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels
133+
// aggregate chunk-details and de-duplicate nodes and relationships
134+
WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels,
135+
136+
// TODO sort by relevancy (embeddding comparision?) cut off after X (e.g. 25) nodes?
137+
apoc.coll.toSet(apoc.coll.flatten(collect(
138+
[r in rels |[startNode(r),endNode(r)]]),true)) as nodes
139+
140+
// generate metadata and text components for chunks, nodes and relationships
103141
WITH d, avg_score,
104142
[c IN chunks | c.chunk.text] AS texts,
105143
[c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails,
106-
[r in rels | coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ startNode(r).id + " "+ type(r) + " " + coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id] as entities
144+
apoc.coll.sort([n in nodes |
145+
146+
coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+
147+
n.id + (case when n.description is not null then " ("+ n.description+")" else "" end)]) as nodeTexts,
148+
apoc.coll.sort([r in rels
149+
// optional filter if we limit the node-set
150+
// WHERE startNode(r) in nodes AND endNode(r) in nodes
151+
|
152+
coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+
153+
startNode(r).id +
154+
" " + type(r) + " " +
155+
coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" +
156+
endNode(r).id
157+
]) as relTexts
158+
159+
// combine texts into response-text
107160
WITH d, avg_score,chunkdetails,
161+
"Text Content:\n" +
108162
apoc.text.join(texts,"\n----\n") +
109-
apoc.text.join(entities,"\n")
163+
"\n----\nEntities:\n"+
164+
apoc.text.join(nodeTexts,"\n") +
165+
"\n----\nRelationships:\n"+
166+
apoc.text.join(relTexts,"\n")
167+
110168
as text
111-
RETURN text, avg_score AS score, {source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
112-
"""
169+
RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata
170+
"""
113171

114172

115173

0 commit comments

Comments
 (0)