backend

rahul-aot · rahul-aot · commit 3a91c391bfa6 · 2025-12-13T13:08:53.000-08:00
diff --git a/backend/.gitignore b/backend/.gitignore
@@ -0,0 +1,63 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+
+# Virtual environments
+.venv
+pip-log.txt
+pip-delete-this-directory.txt
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+.DS_Store
+
+# Environment variables
+.env
+.env.local
+.env.*.local
+
+# Database
+*.db
+*.sqlite
+*.sqlite3
+
+# Logs
+*.log
+logs/
+
+# Testing
+.pytest_cache/
+.coverage
+htmlcov/
+
+# OS
+.DS_Store
+Thumbs.db
+
+
+venv
diff --git a/backend/app/api/ask.py b/backend/app/api/ask.py
@@ -0,0 +1,108 @@
+"""POST /api/ask: answer a question from the session's indexed documents."""
+
+import json
+
+import faiss
+from fastapi import APIRouter, Depends, HTTPException
+from pydantic import BaseModel
+
+from app.config import VECTOR_DIR
+from app.deps import get_session_id, get_gemini_key
+from app.utils.gemini import get_client, embed_texts, generate_answer
+from app.utils.vector_store import search
+
+router = APIRouter()
+
+# Number of nearest chunks retrieved as context for each question.
+TOP_K = 5
+
+# Reply the prompt tells the model to give when the context has no answer.
+NOT_FOUND_ANSWER = "Not found in the document"
+
+
+class Question(BaseModel):
+    """Request body for ``/ask``."""
+
+    question: str
+
+
+@router.post("/ask")
+def ask(
+    payload: Question,
+    session_id: str = Depends(get_session_id),
+    api_key: str = Depends(get_gemini_key),
+):
+    """Answer ``payload.question`` using only the session's stored chunks.
+
+    Returns:
+        ``{"answer": ...}`` with the model's reply, or the fixed not-found
+        reply when no chunk could be retrieved.
+
+    Raises:
+        HTTPException: 400 if the question is blank or nothing has been
+            uploaded for this session.
+    """
+    if not payload.question.strip():
+        raise HTTPException(status_code=400, detail="Question must not be empty")
+
+    # 1. Check vector store
+    session_vector = VECTOR_DIR / session_id
+    index_path = session_vector / "index.faiss"
+    chunks_path = session_vector / "chunks.json"
+
+    if not index_path.exists() or not chunks_path.exists():
+        raise HTTPException(
+            status_code=400,
+            detail="No document uploaded for this session"
+        )
+
+    # 2. Gemini client (per request)
+    client = get_client(api_key)
+
+    # 3. Load FAISS index
+    index = faiss.read_index(str(index_path))
+
+    # 4. Embed question
+    q_embedding = embed_texts([payload.question], client)[0]
+
+    # 5. Similarity search via the shared vector-store helper
+    indices = search(index, q_embedding, k=TOP_K)
+
+    # 6. Load chunk metadata
+    with open(chunks_path, "r", encoding="utf-8") as f:
+        chunk_map = json.load(f)
+
+    # 7. Build real context from retrieved chunks. FAISS fills unused result
+    #    slots with -1 when the index holds fewer than k vectors.
+    retrieved_chunks = []
+    for idx in indices:
+        if idx < 0:
+            continue
+        text = chunk_map.get(str(idx))
+        if text:
+            retrieved_chunks.append(text)
+
+    if not retrieved_chunks:
+        # Nothing to ground an answer on, so skip the model call.
+        return {"answer": NOT_FOUND_ANSWER}
+
+    context = "\n\n".join(retrieved_chunks)
+
+    # 8. Prompt (strict RAG)
+    prompt = f"""
+You are answering strictly from the document context below.
+
+If the answer is not present in the context, respond with:
+"{NOT_FOUND_ANSWER}"
+
+Context:
+{context}
+
+Question:
+{payload.question}
+"""
+
+    # 9. Generate answer
+    answer = generate_answer(prompt, client)
+
+    return {"answer": answer}
diff --git a/backend/app/api/clear.py b/backend/app/api/clear.py
@@ -0,0 +1,41 @@
+"""DELETE /api/clear: remove a session's uploads and vector store."""
+
+import shutil
+from pathlib import Path
+
+from fastapi import APIRouter, Depends, HTTPException
+
+from app.config import UPLOAD_DIR, VECTOR_DIR
+from app.deps import get_session_id
+
+router = APIRouter()
+
+
+def _session_dir(root: Path, session_id: str) -> Path:
+    """Return the session's directory, which must sit directly under ``root``.
+
+    ``session_id`` comes from a request header. Joined unchecked, a value such
+    as ``..`` or an absolute path would point ``rmtree`` outside ``root``.
+
+    Raises:
+        HTTPException: 400 if the id does not name a direct child of ``root``.
+    """
+    root = root.resolve()
+    target = (root / session_id).resolve()
+    if target.parent != root:
+        raise HTTPException(status_code=400, detail="Invalid session id")
+    return target
+
+
+@router.delete("/clear")
+def clear_session(session_id: str = Depends(get_session_id)):
+    """Delete everything stored for the session and report success."""
+    # Resolve both paths first so nothing is deleted if the id is rejected.
+    targets = [
+        _session_dir(UPLOAD_DIR, session_id),
+        _session_dir(VECTOR_DIR, session_id),
+    ]
+    for target in targets:
+        # Best effort: a session with no stored data still counts as cleared.
+        shutil.rmtree(target, ignore_errors=True)
+    return {"cleared": True}
diff --git a/backend/app/api/upload.py b/backend/app/api/upload.py
@@ -0,0 +1,92 @@
+"""POST /api/upload: ingest a PDF into the session's vector store."""
+
+import json
+from pathlib import Path
+
+from fastapi import APIRouter, UploadFile, Depends, HTTPException
+
+from app.config import UPLOAD_DIR, VECTOR_DIR
+from app.deps import get_session_id, get_gemini_key
+from app.utils.pdf_loader import extract_text_from_pdf
+from app.utils.text_splitter import split_text
+from app.utils.gemini import get_client, embed_texts
+from app.utils.vector_store import create_or_load_index, add_vectors, save_index
+
+router = APIRouter()
+
+
+@router.post("/upload")
+async def upload_file(
+    file: UploadFile,
+    session_id: str = Depends(get_session_id),
+    api_key: str = Depends(get_gemini_key),
+):
+    """Store an uploaded PDF, embed its text and add it to the session index.
+
+    Returns:
+        ``{"status": "READY", "filename": ..., "chunks": ...}``, where
+        ``chunks`` counts the chunks added by this upload.
+
+    Raises:
+        HTTPException: 415 if the file name does not end in ``.pdf``; 400 if
+            no text could be extracted from it.
+    """
+    # 1. Validate file. Keep only the last path component of the client's
+    #    name so it cannot point outside the session directory.
+    filename = Path(file.filename or "").name
+    if not filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=415, detail="Only PDF supported")
+
+    # 2. Create Gemini client (per request)
+    client = get_client(api_key)
+
+    # 3. Save file
+    session_upload = UPLOAD_DIR / session_id
+    session_upload.mkdir(parents=True, exist_ok=True)
+
+    file_path = session_upload / filename
+    with open(file_path, "wb") as f:
+        f.write(await file.read())
+
+    # 4. Extract + split text
+    text = extract_text_from_pdf(str(file_path))
+    chunks = split_text(text)
+
+    if not chunks:
+        raise HTTPException(status_code=400, detail="No text found in PDF")
+
+    # 5. Generate embeddings
+    embeddings = embed_texts(chunks, client)
+
+    # 6. Create / append vector store
+    session_vector = VECTOR_DIR / session_id
+    session_vector.mkdir(parents=True, exist_ok=True)
+
+    index_path = session_vector / "index.faiss"
+    chunks_path = session_vector / "chunks.json"
+    index = create_or_load_index(index_path, len(embeddings[0]))
+
+    # A reloaded index already holds earlier uploads, and FAISS numbers new
+    # vectors from its current size, so chunk ids must start there too.
+    first_id = index.ntotal
+
+    add_vectors(index, embeddings)
+    save_index(index, index_path)
+
+    # 7. Save chunk text keyed by vector id (required for RAG). Entries from
+    #    earlier uploads are kept only when the index still holds their vectors.
+    chunk_map = {}
+    if first_id and chunks_path.exists():
+        with open(chunks_path, "r", encoding="utf-8") as f:
+            chunk_map = json.load(f)
+    for offset, chunk in enumerate(chunks):
+        chunk_map[str(first_id + offset)] = chunk
+
+    with open(chunks_path, "w", encoding="utf-8") as f:
+        json.dump(chunk_map, f, ensure_ascii=False, indent=2)
+
+    return {
+        "status": "READY",
+        "filename": filename,
+        "chunks": len(chunks),
+    }
diff --git a/backend/app/api/validate.py b/backend/app/api/validate.py
@@ -0,0 +1,38 @@
+"""POST /api/validate-key: check that a Gemini API key is usable."""
+
+from fastapi import APIRouter, Depends, HTTPException
+from google.genai import errors as genai_errors
+
+from app.deps import get_gemini_key
+from app.utils.gemini import get_client, embed_texts
+
+router = APIRouter()
+
+# Status codes treated as "the key itself was rejected".
+# NOTE(review): assumes Gemini reports a bad key as 400/401/403 - confirm.
+_KEY_REJECTED_CODES = (400, 401, 403)
+
+
+@router.post("/validate-key")
+def validate_key(api_key: str = Depends(get_gemini_key)):
+    """Check the key by issuing one small embedding request.
+
+    Returns:
+        ``{"valid": True}`` when the request succeeds.
+
+    Raises:
+        HTTPException: 401 if Gemini rejects the key, 502 for any other
+            Gemini API error (instead of an unhandled 500).
+    """
+    client = get_client(api_key)
+    try:
+        embed_texts(["ping"], client)
+    except genai_errors.APIError as exc:
+        if exc.code in _KEY_REJECTED_CODES:
+            raise HTTPException(
+                status_code=401, detail="Invalid Gemini API key"
+            ) from exc
+        raise HTTPException(
+            status_code=502, detail="Gemini API request failed"
+        ) from exc
+    return {"valid": True}
diff --git a/backend/app/config.py b/backend/app/config.py
@@ -0,0 +1,13 @@
+"""Filesystem locations used by the backend."""
+
+from pathlib import Path
+
+# Parent of the directory that contains this file.
+BASE_DIR = Path(__file__).resolve().parents[1]
+
+UPLOAD_DIR = BASE_DIR.joinpath("tmp", "uploads")
+VECTOR_DIR = BASE_DIR.joinpath("vectorstore")
+
+# Create both storage roots at import time so later code can rely on them.
+for _directory in (UPLOAD_DIR, VECTOR_DIR):
+    _directory.mkdir(parents=True, exist_ok=True)
diff --git a/backend/app/deps.py b/backend/app/deps.py
@@ -0,0 +1,37 @@
+"""FastAPI dependencies that read per-request headers."""
+
+from typing import Optional
+
+from fastapi import Header, HTTPException
+
+# Characters that would let a session id act as a path instead of a name.
+_PATH_CHARS = "/\\\x00"
+
+
+def get_session_id(x_session_id: str = Header(...)):
+    """Return the session id sent in the ``X-Session-Id`` header.
+
+    Route handlers join this value onto their storage roots, so ids that
+    could resolve outside those roots are rejected here.
+
+    Raises:
+        HTTPException: 400 if the id is empty, ``.``/``..``, or contains a
+            path separator or NUL byte.
+    """
+    is_path_like = x_session_id in ("", ".", "..") or any(
+        char in x_session_id for char in _PATH_CHARS
+    )
+    if is_path_like:
+        raise HTTPException(status_code=400, detail="Invalid session id")
+    return x_session_id
+
+
+def get_gemini_key(x_gemini_key: Optional[str] = Header(None)):
+    """Return the Gemini API key sent in the ``X-Gemini-Key`` header.
+
+    Raises:
+        HTTPException: 401 if the header is missing or empty.
+    """
+    if not x_gemini_key:
+        raise HTTPException(status_code=401, detail="Gemini API key required")
+    return x_gemini_key
diff --git a/backend/app/main.py b/backend/app/main.py
@@ -0,0 +1,11 @@
+"""Application entry point: builds the FastAPI app and mounts the routes."""
+
+from fastapi import FastAPI
+
+from app.api import validate, upload, ask, clear
+
+app = FastAPI(title="RAG Backend")
+
+# Every route module exposes a ``router``; all are served under /api.
+for _module in (validate, upload, ask, clear):
+    app.include_router(_module.router, prefix="/api")
diff --git a/backend/app/utils/gemini.py b/backend/app/utils/gemini.py
@@ -0,0 +1,54 @@
+"""Thin wrappers around the google-genai client used by the API routes."""
+
+from google import genai
+
+EMBEDDING_MODEL = "models/text-embedding-004"
+GENERATION_MODEL = "gemini-2.5-flash"
+
+# Texts sent per embedding request, so large documents need far fewer round
+# trips than one call per chunk.
+# NOTE(review): 100 is believed to be the Gemini batch-embedding limit; confirm.
+EMBED_BATCH_SIZE = 100
+
+
+def get_client(api_key: str):
+    """Create a Gemini client authenticated with ``api_key``."""
+    return genai.Client(api_key=api_key)
+
+
+def embed_texts(texts: list[str], client):
+    """Embed ``texts`` and return one vector per text, in input order.
+
+    Args:
+        texts: Strings to embed; an empty list yields an empty result.
+        client: Client returned by :func:`get_client`.
+
+    Returns:
+        A list with one embedding (the ``values`` of each result) per text.
+
+    Raises:
+        RuntimeError: If the API returns a different number of embeddings
+            than texts sent, which would misalign vectors and chunks.
+    """
+    embeddings = []
+    for start in range(0, len(texts), EMBED_BATCH_SIZE):
+        batch = texts[start:start + EMBED_BATCH_SIZE]
+        result = client.models.embed_content(
+            model=EMBEDDING_MODEL,
+            contents=batch,
+        )
+        if len(result.embeddings) != len(batch):
+            raise RuntimeError(
+                f"Expected {len(batch)} embeddings, got {len(result.embeddings)}"
+            )
+        embeddings.extend(item.values for item in result.embeddings)
+    return embeddings
+
+
+def generate_answer(prompt: str, client):
+    """Send ``prompt`` to the generation model and return the reply text."""
+    response = client.models.generate_content(
+        model=GENERATION_MODEL,
+        contents=prompt,
+    )
+    return response.text
diff --git a/backend/app/utils/pdf_loader.py b/backend/app/utils/pdf_loader.py
@@ -0,0 +1,12 @@
+import fitz
+
+
+def extract_text_from_pdf(path: str) -> str:
+    """Return the text of every page of the PDF at ``path``, concatenated.
+
+    Pages are joined in document order with no separator added between them.
+    """
+    # Opening via ``with`` closes the document (and its file handle) even if
+    # text extraction fails part-way through.
+    with fitz.open(path) as doc:
+        return "".join(page.get_text() for page in doc)
diff --git a/backend/app/utils/text_splitter.py b/backend/app/utils/text_splitter.py
@@ -0,0 +1,34 @@
+def split_text(text: str, chunk_size=500, overlap=100):
+    """Split ``text`` into overlapping fixed-size character chunks.
+
+    Consecutive chunks start ``chunk_size - overlap`` characters apart, so
+    each chunk repeats the last ``overlap`` characters of the previous one.
+    Splitting stops at the first chunk that reaches the end of the text, so
+    no trailing chunk made only of already-covered characters is produced.
+
+    Args:
+        text: The text to split.
+        chunk_size: Maximum number of characters per chunk.
+        overlap: Number of characters shared by neighbouring chunks.
+
+    Returns:
+        The chunks in document order; an empty list for empty ``text``.
+
+    Raises:
+        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
+            in ``[0, chunk_size)``; such values could never advance.
+    """
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be positive")
+    if not 0 <= overlap < chunk_size:
+        raise ValueError("overlap must be in the range [0, chunk_size)")
+
+    step = chunk_size - overlap
+    chunks = []
+    for start in range(0, len(text), step):
+        end = start + chunk_size
+        chunks.append(text[start:end])
+        if end >= len(text):
+            # The tail is covered; another chunk would only repeat overlap.
+            break
+    return chunks
diff --git a/backend/app/utils/vector_store.py b/backend/app/utils/vector_store.py
@@ -0,0 +1,32 @@
+"""Helpers for creating, persisting and querying a FAISS index."""
+
+import faiss
+import numpy as np
+from pathlib import Path
+
+
+def create_or_load_index(path: Path, dim: int):
+    """Load the index stored at ``path``, or build an empty L2 index.
+
+    ``dim`` is only used when a new index is created.
+    """
+    if not path.exists():
+        return faiss.IndexFlatL2(dim)
+    return faiss.read_index(str(path))
+
+
+def save_index(index, path: Path):
+    """Write ``index`` to ``path``."""
+    faiss.write_index(index, str(path))
+
+
+def add_vectors(index, embeddings: list[list[float]]):
+    """Append ``embeddings`` to ``index`` as float32 rows."""
+    index.add(np.asarray(embeddings, dtype="float32"))
+
+
+def search(index, query_embedding, k=5):
+    """Return the ids of the ``k`` vectors nearest to ``query_embedding``."""
+    query = np.asarray([query_embedding], dtype="float32")
+    _distances, neighbour_ids = index.search(query, k)
+    return neighbour_ids[0]
diff --git a/backend/requirements.txt b/backend/requirements.txt
@@ -0,0 +1,7 @@
+fastapi
+uvicorn
+python-multipart
+pymupdf
+faiss-cpu
+numpy
+google-genai
diff --git a/backend/tmp/uploads/0/Gen AI.pdf b/backend/tmp/uploads/0/Gen AI.pdf
diff --git a/backend/vectorstore/0/chunks.json b/backend/vectorstore/0/chunks.json
diff --git a/backend/vectorstore/0/index.faiss b/backend/vectorstore/0/index.faiss