16 | 16 |
17 | 17 | ## CHAT SETUP |
18 | 18 | CHAT_MAX_TOKENS = 1000 |
19 | | -CHAT_SEARCH_KWARG_K = 5 |
| 19 | +CHAT_SEARCH_KWARG_K = 3 |
20 | 20 | CHAT_SEARCH_KWARG_SCORE_THRESHOLD = 0.7 |
21 | 21 | CHAT_DOC_SPLIT_SIZE = 3000 |
22 | | -CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.15 |
| 22 | +CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD = 0.10 |
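For context, a minimal sketch of how these retriever constants could plug into a LangChain retrieval pipeline. The `vector_store` and `embeddings` names are assumptions for illustration, not part of this diff:

```python
# Sketch only, assuming `vector_store` is a LangChain Neo4jVector instance
# and `embeddings` is the embedding model used at index time.
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import EmbeddingsFilter

retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs={
        "k": CHAT_SEARCH_KWARG_K,  # top-k, lowered from 5 to 3 in this change
        "score_threshold": CHAT_SEARCH_KWARG_SCORE_THRESHOLD,
    },
)

# Second-pass filter on document embeddings; the threshold drops from
# 0.15 to 0.10 here, so fewer retrieved documents get filtered out.
doc_filter = EmbeddingsFilter(
    embeddings=embeddings,
    similarity_threshold=CHAT_EMBEDDING_FILTER_SCORE_THRESHOLD,
)
compression_retriever = ContextualCompressionRetriever(
    base_compressor=doc_filter,
    base_retriever=retriever,
)
```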
23 | 23 | CHAT_TOKEN_CUT_OFF = { |
24 | | - "gpt-3.5": 5, |
25 | | - "gemini-1.0-pro": 5, |
26 | | - "gemini-1.5-pro": 10, |
27 | | - "gpt-4": 10, |
28 | | - "diffbot" : 10, |
29 | | - "gpt-4o": 10, |
30 | | - "groq-llama3" : 5 |
| 24 | +    ("gpt-3.5", "gemini-1.0-pro", "gemini-1.5-pro", "groq-llama3"): 4,
| 25 | +    ("gpt-4", "diffbot", "gpt-4o"): 28
31 | 26 | } |
| 27 | + |
| 28 | + |
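Note that with tuple keys, `CHAT_TOKEN_CUT_OFF[model_name]` no longer works for a single model name; callers need a membership lookup. A hypothetical helper (not part of this diff) would look like:

```python
def get_token_cut_off(model_name: str, default: int = 4) -> int:
    # Tuple keys group models that share a cut-off; scan for membership.
    for models, cut_off in CHAT_TOKEN_CUT_OFF.items():
        if model_name in models:
            return cut_off
    return default

assert get_token_cut_off("gpt-4o") == 28
assert get_token_cut_off("groq-llama3") == 4
```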
32 | 29 | ### CHAT TEMPLATES |
33 | 30 | CHAT_SYSTEM_TEMPLATE = """ |
34 | 31 | You are an AI-powered question-answering agent. Your task is to provide accurate and comprehensive responses to user queries based on the given context, chat history, and available resources. |
90 | 87 | {source: COALESCE(CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} as metadata |
91 | 88 | """ |
92 | 89 |
93 | | -VECTOR_GRAPH_SEARCH_QUERY=""" |
| 90 | +# VECTOR_GRAPH_SEARCH_QUERY=""" |
| 91 | +# WITH node as chunk, score |
| 92 | +# MATCH (chunk)-[:PART_OF]->(d:Document) |
| 93 | +# CALL { WITH chunk |
| 94 | +# MATCH (chunk)-[:HAS_ENTITY]->(e) |
| 95 | +# MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) |
| 96 | +# UNWIND rels as r |
| 97 | +# RETURN collect(distinct r) as rels |
| 98 | +# } |
| 99 | +# WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels |
| 100 | +# WITH d, avg_score, |
| 101 | +# [c IN chunks | c.chunk.text] AS texts, |
| 102 | +# [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails, |
| 103 | +# [r in rels | coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ startNode(r).id + " "+ type(r) + " " + coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id] as entities |
| 104 | +# WITH d, avg_score,chunkdetails, |
| 105 | +# apoc.text.join(texts,"\n----\n") + |
| 106 | +# apoc.text.join(entities,"\n") |
| 107 | +# as text |
| 108 | +# RETURN text, avg_score AS score, {source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata |
| 109 | +# """ |
| 110 | + |
| 111 | + |
| 112 | +VECTOR_GRAPH_SEARCH_QUERY = """ |
94 | 113 | WITH node as chunk, score |
| 114 | +// find the document of the chunk |
95 | 115 | MATCH (chunk)-[:PART_OF]->(d:Document) |
| 116 | +// fetch entities |
96 | 117 | CALL { WITH chunk |
| 118 | +// entities connected to the chunk |
| 119 | +// TODO: only return entities that are actually in the chunk; currently we connect all extracted entities to all chunks
97 | 120 | MATCH (chunk)-[:HAS_ENTITY]->(e) |
98 | | -MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) |
99 | | -UNWIND rels as r |
100 | | -RETURN collect(distinct r) as rels |
| 121 | +
| 122 | +// depending on the match to the query embedding, do either a 1- or 2-step expansion
| 123 | +WITH CASE WHEN true // vector.similarity.cosine($embedding, e.embedding) <= 0.95
| 124 | +THEN |
| 125 | +collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,1}(:!Chunk&!Document) RETURN path } |
| 126 | +ELSE |
| 127 | +collect { MATCH path=(e)(()-[rels:!HAS_ENTITY&!PART_OF]-()){0,2}(:!Chunk&!Document) RETURN path } |
| 128 | +END as paths |
| 129 | +
| 130 | +RETURN collect{ unwind paths as p unwind relationships(p) as r return distinct r} as rels, |
| 131 | +collect{ unwind paths as p unwind nodes(p) as n return distinct n} as nodes |
101 | 132 | } |
102 | | -WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels |
| 133 | +// aggregate chunk-details and de-duplicate nodes and relationships |
| 134 | +WITH d, collect(DISTINCT {chunk: chunk, score: score}) AS chunks, avg(score) as avg_score, apoc.coll.toSet(apoc.coll.flatten(collect(rels))) as rels, |
| 135 | +
| 136 | +// TODO: sort by relevancy (embedding comparison?) and cut off after X (e.g. 25) nodes?
| 137 | +apoc.coll.toSet(apoc.coll.flatten(collect( |
| 138 | + [r in rels |[startNode(r),endNode(r)]]),true)) as nodes |
| 139 | +
| 140 | +// generate metadata and text components for chunks, nodes and relationships |
103 | 141 | WITH d, avg_score, |
104 | 142 | [c IN chunks | c.chunk.text] AS texts, |
105 | 143 | [c IN chunks | {id: c.chunk.id, score: c.score}] AS chunkdetails, |
106 | | - [r in rels | coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ startNode(r).id + " "+ type(r) + " " + coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + endNode(r).id] as entities |
| 144 | + apoc.coll.sort([n in nodes | |
| 145 | +
| 146 | +coalesce(apoc.coll.removeAll(labels(n),['__Entity__'])[0],"") +":"+ |
| 147 | +n.id + (case when n.description is not null then " ("+ n.description+")" else "" end)]) as nodeTexts, |
| 148 | + apoc.coll.sort([r in rels |
| 149 | + // optional filter if we limit the node-set |
| 150 | + // WHERE startNode(r) in nodes AND endNode(r) in nodes |
| 151 | + | |
| 152 | +coalesce(apoc.coll.removeAll(labels(startNode(r)),['__Entity__'])[0],"") +":"+ |
| 153 | +startNode(r).id + |
| 154 | +" " + type(r) + " " + |
| 155 | +coalesce(apoc.coll.removeAll(labels(endNode(r)),['__Entity__'])[0],"") +":" + |
| 156 | +endNode(r).id |
| 157 | +]) as relTexts |
| 158 | +
| 159 | +// combine texts into response-text |
107 | 160 | WITH d, avg_score,chunkdetails, |
| 161 | +"Text Content:\n" + |
108 | 162 | apoc.text.join(texts,"\n----\n") + |
109 | | -apoc.text.join(entities,"\n") |
| 163 | +"\n----\nEntities:\n"+ |
| 164 | +apoc.text.join(nodeTexts,"\n") + |
| 165 | +"\n----\nRelationships:\n"+ |
| 166 | +apoc.text.join(relTexts,"\n") |
| 167 | +
110 | 168 | as text |
111 | | -RETURN text, avg_score AS score, {source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata |
112 | | -""" |
| 169 | +RETURN text, avg_score as score, {length:size(text), source: COALESCE( CASE WHEN d.url CONTAINS "None" THEN d.fileName ELSE d.url END, d.fileName), chunkdetails: chunkdetails} AS metadata |
| 170 | +""" |
113 | 171 |
114 | 172 |
115 | 173 |
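For reference, a retrieval query like this is typically handed to LangChain's `Neo4jVector` via its `retrieval_query` parameter. A hedged sketch, with the index name as a placeholder and connection details assumed, not taken from this diff:

```python
from langchain_community.vectorstores import Neo4jVector

# Sketch only: the query replaces Neo4jVector's default retrieval step, so
# each hit carries the combined chunk/entity/relationship text plus the
# metadata map built above (including the new `length` field).
# Connection details assumed via NEO4J_URI / NEO4J_USERNAME / NEO4J_PASSWORD.
vector_store = Neo4jVector.from_existing_index(
    embedding=embeddings,                      # assumed embedding model
    index_name="vector",                       # placeholder index name
    retrieval_query=VECTOR_GRAPH_SEARCH_QUERY,
)
docs_with_scores = vector_store.similarity_search_with_score(
    "What does the document say about X?",     # example question
    k=CHAT_SEARCH_KWARG_K,
)
```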