-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
97 lines (89 loc) · 4.17 KB
/
main.py
File metadata and controls
97 lines (89 loc) · 4.17 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
from pathlib import Path
from typing import Dict, Tuple, List, TypedDict
import os
import glob
import hashlib
import numpy as np
from interface.chunk import SavedChunk
from chunker import SemanticChunker
from retrievers.symbolic_graph.symbolic_kg import SymbolicGraphBuilder
from retrievers.semantic_cluster.cluster_builder import EmergentSemanticGraphBuilder
from retrievers.symbolic_graph.symbolic_retriver import SymbolicRetriever
from retrievers.semantic_cluster.cluster_retriever import SemanticClusterRetriever
from retrievers.bm25.bm25 import BM25Retriever
from retrievers.faiss.faiss import FaissRetriever
from retrievers.hybrid_retriever.hybrid import HybridRetriever
from planner.plan import Planner
from llm.gemini import GeminiLLM
from storage import ChunkStorage
from storage.embed_store import ChunkEmbeddingStore
from interface.file import FileMeta
def _normalize_path(path_str: str) -> str:
"""Normalizes a path string for consistency."""
return os.path.normpath(path_str).replace('\\', '/')
def scan_files(base_path: str) -> Dict[str, FileMeta]:
    """Recursively scan *base_path* and build a metadata map of every file.

    Args:
        base_path: Root directory of the project to index.

    Returns:
        Mapping of file path -> FileMeta with the content hash, the
        project-relative (forward-slash) path, and the project root.

    Unreadable files are skipped with a console message rather than
    aborting the scan.
    """
    file_map: Dict[str, FileMeta] = {}
    for filepath in glob.glob(f"{base_path}/**/*", recursive=True):
        if os.path.isfile(filepath):
            # BUGFIX: open() used to sit outside the try block, so a single
            # unreadable file (e.g. PermissionError, file deleted mid-scan)
            # crashed the whole scan instead of being skipped.
            try:
                # errors="ignore" keeps binary / oddly-encoded files from
                # raising; note the hash is therefore over the *decoded*
                # text, not the raw bytes on disk.
                with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
                    content = f.read()
                file_hash = hashlib.sha256(content.encode()).hexdigest()
                file_map[filepath] = {
                    "hash": file_hash,
                    "relative_path": _normalize_path(str(Path(filepath).relative_to(base_path))),
                    "project_root": base_path,
                }
            except Exception as e:
                # Deliberately best-effort: log and move on to the next file.
                print(f"Skipping {filepath}: {e}")
    return file_map
def get_all_chunks(
    file_map: Dict[str, FileMeta], storage: ChunkStorage, chunker: SemanticChunker
) -> List[SavedChunk]:
    """Collect chunks for every scanned file, reusing the cache when possible.

    A file whose (path, content-hash) pair is already present in *storage*
    is served from the cache; any other file is chunked fresh and the
    result persisted before being added to the output.
    """
    collected: List[SavedChunk] = []
    for file_path, meta in file_map.items():
        content_hash = meta["hash"]
        cached = storage.get_chunks_for_file(file_path, content_hash)
        if not cached:
            fresh = chunker.chunk(file_path, content_hash)
            storage.store_chunks(file_path, content_hash, fresh)
            collected.extend(fresh)
            continue
        print("Chunks found in sqlite for file:", file_path)
        collected.extend(cached)
    return collected
def main() -> None:
    """Build the symbolic + semantic indexes, wire up the retrievers, and
    answer a demo query over ./sample with the hybrid retriever."""
    base_path = "./sample"
    graph_file = "symbolic_graph.pkl"

    # SECURITY FIX: a live Gemini API key was previously hard-coded here.
    # That key is burned (it was committed) and must be rotated; the key is
    # now read from the environment instead of living in source control.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        raise SystemExit("Set the GEMINI_API_KEY environment variable.")
    model = GeminiLLM(model_name="gemini-2.5-flash", api_key=api_key)

    # Scan the project, then chunk each file (served from the sqlite cache
    # when the file's content hash is unchanged).
    filemap = scan_files(base_path)
    chunk_store = ChunkStorage()
    chunker = SemanticChunker()
    chunks = get_all_chunks(filemap, chunk_store, chunker)
    embed_store = ChunkEmbeddingStore(chunks=chunks)
    print("Chunks: ", len(chunks))

    # Symbolic knowledge graph: built/updated from the chunks, then persisted.
    sym_builder = SymbolicGraphBuilder(graph_path=graph_file)
    sym_builder.update_graph(chunks, filemap)
    print("Graph Stats:", sym_builder.get_graph_stats())
    sym_builder.save_graph()

    # Emergent semantic cluster graph over the chunk embeddings.
    semantic_graph = EmergentSemanticGraphBuilder(
        graph_path="semantic_graph.pkl", min_cluster_size=4, embedding_store=embed_store
    )
    semantic_graph.update_graph(chunks, filemap)
    semantic_graph.save_graph()
    print("Graph Stats:", semantic_graph.get_graph_stats())

    cluster_data = semantic_graph.get_clusters()
    print("len of chunks: ", len(chunks), " clusters: ", len(cluster_data))

    # Four base retrievers fused by the hybrid retriever.
    retrievers = {
        "bm25": BM25Retriever(saved_chunks=chunks),
        "faiss": FaissRetriever(embedding_store=embed_store, saved_chunks=chunks),
        "cluster": SemanticClusterRetriever(
            graph=semantic_graph.get_graph(), embedding_store=embed_store
        ),
        "symbolic": SymbolicRetriever(builder=sym_builder),
    }
    hyb = HybridRetriever(retrievers=retrievers)

    planner = Planner(llm=model, retriever=hyb)
    ans = planner.run(
        "Tell me about Election process, and The Indian Evidence Act (1872) from inner_folder folder",
        top_k=5,
    )
    print("Answer: ", ans)


if __name__ == "__main__":
    main()