-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
97 lines (89 loc) · 4.17 KB
/
main.py
File metadata and controls
97 lines (89 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from pathlib import Path
from typing import Dict, Tuple, List, TypedDict
import os
import glob
import hashlib
import numpy as np
from interface.chunk import SavedChunk
from chunker import SemanticChunker
from retrievers.symbolic_graph.symbolic_kg import SymbolicGraphBuilder
from retrievers.semantic_cluster.cluster_builder import EmergentSemanticGraphBuilder
from retrievers.symbolic_graph.symbolic_retriver import SymbolicRetriever
from retrievers.semantic_cluster.cluster_retriever import SemanticClusterRetriever
from retrievers.bm25.bm25 import BM25Retriever
from retrievers.faiss.faiss import FaissRetriever
from retrievers.hybrid_retriever.hybrid import HybridRetriever
from planner.plan import Planner
from llm.gemini import GeminiLLM
from storage import ChunkStorage
from storage.embed_store import ChunkEmbeddingStore
from interface.file import FileMeta
def _normalize_path(path_str: str) -> str:
"""Normalizes a path string for consistency."""
return os.path.normpath(path_str).replace('\\', '/')
def scan_files(base_path: str) -> Dict[str, FileMeta]:
    """Recursively scan *base_path* and build a metadata map of every file.

    Args:
        base_path: Root directory of the project to index.

    Returns:
        Mapping of file path -> FileMeta with the content hash, the
        project-relative (forward-slash) path, and the project root.

    Unreadable files are skipped with a console message rather than
    aborting the scan.
    """
    file_map: Dict[str, FileMeta] = {}
    for filepath in glob.glob(f"{base_path}/**/*", recursive=True):
        if os.path.isfile(filepath):
            # BUGFIX: open() used to sit outside the try block, so a single
            # unreadable file (e.g. PermissionError, file deleted mid-scan)
            # crashed the whole scan instead of being skipped.
            try:
                # errors="ignore" keeps binary / oddly-encoded files from
                # raising; note the hash is therefore over the *decoded*
                # text, not the raw bytes on disk.
                with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
                file_hash = hashlib.sha256(content.encode()).hexdigest()
                file_map[filepath] = {
                    "hash": file_hash,
                    "relative_path": _normalize_path(str(Path(filepath).relative_to(base_path))),
                    "project_root": base_path,
                }
            except Exception as e:
                # Deliberately best-effort: log and move on to the next file.
                print(f"Skipping {filepath}: {e}")
    return file_map
def get_all_chunks(
    file_map: Dict[str, FileMeta], storage: ChunkStorage, chunker: SemanticChunker
) -> List[SavedChunk]:
    """Collect chunks for every scanned file, reusing the cache when possible.

    A file whose (path, content-hash) pair is already present in *storage*
    is served from the cache; any other file is chunked fresh and the
    result persisted before being added to the output.
    """
    collected: List[SavedChunk] = []
    for file_path, meta in file_map.items():
        content_hash = meta["hash"]
        cached = storage.get_chunks_for_file(file_path, content_hash)
        if not cached:
            fresh = chunker.chunk(file_path, content_hash)
            storage.store_chunks(file_path, content_hash, fresh)
            collected.extend(fresh)
            continue
        print("Chunks found in sqlite for file:", file_path)
        collected.extend(cached)
    return collected
def main() -> None:
    """Build the symbolic + semantic indexes, wire up the retrievers, and
    answer a demo query over ./sample with the hybrid retriever."""
    base_path = "./sample"
    graph_file = "symbolic_graph.pkl"

    # SECURITY FIX: a live Gemini API key was previously hard-coded here.
    # That key is burned (it was committed) and must be rotated; the key is
    # now read from the environment instead of living in source control.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise SystemExit("Set the GEMINI_API_KEY environment variable.")
    model = GeminiLLM(model_name="gemini-2.5-flash", api_key=api_key)

    # Scan the project, then chunk each file (served from the sqlite cache
    # when the file's content hash is unchanged).
    filemap = scan_files(base_path)
    chunk_store = ChunkStorage()
    chunker = SemanticChunker()
    chunks = get_all_chunks(filemap, chunk_store, chunker)
    embed_store = ChunkEmbeddingStore(chunks=chunks)
    print("Chunks: ", len(chunks))

    # Symbolic knowledge graph: built/updated from the chunks, then persisted.
    sym_builder = SymbolicGraphBuilder(graph_path=graph_file)
    sym_builder.update_graph(chunks, filemap)
    print("Graph Stats:", sym_builder.get_graph_stats())
    sym_builder.save_graph()

    # Emergent semantic cluster graph over the chunk embeddings.
    semantic_graph = EmergentSemanticGraphBuilder(
        graph_path="semantic_graph.pkl", min_cluster_size=4, embedding_store=embed_store
    )
    semantic_graph.update_graph(chunks, filemap)
    semantic_graph.save_graph()
    print("Graph Stats:", semantic_graph.get_graph_stats())

    cluster_data = semantic_graph.get_clusters()
    print("len of chunks: ", len(chunks), " clusters: ", len(cluster_data))

    # Four base retrievers fused by the hybrid retriever.
    retrievers = {
        "bm25": BM25Retriever(saved_chunks=chunks),
        "faiss": FaissRetriever(embedding_store=embed_store, saved_chunks=chunks),
        "cluster": SemanticClusterRetriever(
            graph=semantic_graph.get_graph(), embedding_store=embed_store
        ),
        "symbolic": SymbolicRetriever(builder=sym_builder),
    }
    hyb = HybridRetriever(retrievers=retrievers)

    planner = Planner(llm=model, retriever=hyb)
    ans = planner.run(
        "Tell me about Election process, and The Indian Evidence Act (1872) from inner_folder folder",
        top_k=5,
    )
    print("Answer: ", ans)


if __name__ == "__main__":
    main()