Commit 40bb063

Detailed step-by-step explanation added, with document ingestion, test_embeddings, test_rag_pipeline & test_search.py, plus a Flask-based UI to explore RAG
1 parent f0f9c02 commit 40bb063

16 files changed: +452 -0 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -33,3 +33,4 @@ build/
.vscode/
/.venv/
/my_chroma_data/
+/python/chroma_db/
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
1) requirements.txt
2) steps.txt
3) chroma_client_setup_1.py
4) ingest_documents.py
5) test_embeddings.py
6) test_rag_pipeline.py
7) test_search.py
8) app.py

python/large-rag/app.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
"""
TechCorp AI Assistant - Interactive RAG Chat Interface
"""

from flask import Flask, render_template, request, jsonify, Response, stream_with_context
import os
import sys
from datetime import datetime
import json
import time

# Add core modules to path
sys.path.append(os.path.join(os.path.dirname(__file__), 'core'))

from core.vector_engine import VectorEngine
from core.chat_engine import ChatEngine
from core.document_processor import DocumentProcessor

app = Flask(__name__)

# Initialize RAG components
print("\n" + "="*60)
print("🚀 Starting TechCorp AI Assistant")
print("="*60)
print("\n[INIT] Loading RAG components...")

vector_engine = VectorEngine()
print("[INIT] Vector engine ready")

chat_engine = ChatEngine(vector_engine)
print("[INIT] Chat engine ready")

doc_processor = DocumentProcessor(vector_engine)
print("[INIT] Document processor ready")


@app.route('/')
def index():
    """Render the chat interface"""
    return render_template('index.html')


@app.route('/chat', methods=['POST'])
def chat():
    """Handle chat messages"""
    try:
        data = request.json
        user_message = data.get('message', '')

        if not user_message:
            return jsonify({'error': 'No message provided'}), 400

        # Get response from RAG system
        response = chat_engine.get_response(user_message)

        return jsonify({
            'response': response['answer'],
            'sources': response['sources'],
            'confidence': response['confidence'],
            'timestamp': datetime.now().isoformat()
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500


@app.route('/api/chat/stream', methods=['POST'])
def chat_stream():
    """Handle chat messages with streaming response"""
    def generate():
        try:
            data = request.json
            user_message = data.get('message', '')

            if not user_message:
                yield f"data: {json.dumps({'error': 'No message provided'})}\n\n"
                return

            # Send initial event
            yield f"data: {json.dumps({'event': 'start'})}\n\n"

            # Get response from RAG system
            response = chat_engine.get_response(user_message)

            # Stream the response word by word
            words = response['answer'].split()
            for i, word in enumerate(words):
                time.sleep(0.05)  # Small delay for streaming effect
                yield f"data: {json.dumps({'event': 'token', 'content': word + ' '})}\n\n"

            # Send sources at the end
            yield f"data: {json.dumps({'event': 'sources', 'sources': response['sources'], 'confidence': response['confidence']})}\n\n"

            # Send completion event
            yield f"data: {json.dumps({'event': 'done'})}\n\n"

        except Exception as e:
            yield f"data: {json.dumps({'event': 'error', 'error': str(e)})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )


@app.route('/api/status', methods=['GET'])
def status():
    """Get system status"""
    try:
        stats = vector_engine.get_stats()
        return jsonify({
            'status': 'operational',
            'documents': stats['total_documents'],
            'chunks': stats['total_chunks'],
            'last_updated': stats['last_updated']
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500


if __name__ == '__main__':
    # Initialize database with documents on first run
    if not vector_engine.is_initialized():
        print("First run detected. Processing TechCorp documents...")
        doc_processor.process_all_documents()
        print("Document processing complete!")

    # Run the app
    app.run(host='0.0.0.0', port=5252, debug=True)
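
The /api/chat/stream route above emits server-sent events as "data: {...}" lines with event types start, token, sources, done, and error. A minimal client sketch for that endpoint (not part of this commit; it assumes the server is running locally on port 5252 as configured in app.run, and the file name is hypothetical):

# sse_client_sketch.py (hypothetical) - consume the /api/chat/stream endpoint
import json
import requests

resp = requests.post(
    "http://localhost:5252/api/chat/stream",
    json={"message": "What is the return policy?"},
    stream=True,
)

for raw in resp.iter_lines(decode_unicode=True):
    if not raw or not raw.startswith("data: "):
        continue  # skip blank separators between events
    event = json.loads(raw[len("data: "):])
    if event.get("event") == "token":
        print(event["content"], end="", flush=True)
    elif event.get("event") == "sources":
        print("\nSources:", event["sources"], "| confidence:", event["confidence"])
    elif event.get("event") in ("done", "error"):
        break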

python/large-rag/check_chunking.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
import os

print(" DOCUMENT CHUNKING ENGINE")
print("="*40)

def chunk_text(text, size=500, overlap=100):
    """Smart chunking with overlap for context preservation"""
    chunks = []
    start = 0

    while start < len(text):
        end = min(start + size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)

        if end >= len(text):
            break

        start += size - overlap

    return chunks

# Process sample document
with open('../simple-rag-explained/amazon_return_policy.txt', 'r') as file:
    sample_doc = file.read().replace('\n', '')

print(f" Original document: {len(sample_doc)} characters")
print("-"*40)

chunks = chunk_text(sample_doc, size=500, overlap=100)

print(f" Created {len(chunks)} chunks")
print("-"*40)

for i, chunk in enumerate(chunks, 1):
    print(f"\nChunk {i} ({len(chunk)} chars):")
    print(f"Preview: {chunk[:60]}...")

# Save verification
with open('../chunk-test.txt', 'w') as f:
    f.write(f"CHUNKS:{len(chunks)}")

print("\n" + "="*40)
print(" Chunking complete!")
print(f" Stats: {len(chunks)} chunks from {len(sample_doc)} chars")
print(" Ready for vectorization!")

python/large-rag/chroma_client_setup_1.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import chromadb
from chromadb.config import Settings

print(" Initializing ChromaDB persistent client...")
client = chromadb.PersistentClient(
    path="../chroma_db",
    settings=Settings(anonymized_telemetry=False)
)

collection = client.get_or_create_collection(
    name="return_policy_data",
    metadata={"hnsw:space": "cosine"}
)

print(f" Collection created: {collection.name}")
print(f" Memories: {collection.count()}")
print(" AI Data Ready!")

python/large-rag/ingest_documents.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
import os
import chromadb
from sentence_transformers import SentenceTransformer
from pathlib import Path

print("TECHCORP KNOWLEDGE INGESTION SYSTEM")
print("="*50)

# Initialize systems
print("Connecting to AI Brain (from Task 3)...")
client = chromadb.PersistentClient(path="../chroma_db")
collection = client.get_collection("techcorp_docs")

print("Loading Semantic Processor (from Task 5)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("All systems online!\n")

# Process documents
print("Beginning knowledge transfer...")
doc_count = 0
total_chunks = 0

for category in Path('/techcorp-docs').iterdir():
    if category.is_dir():
        print(f"\nProcessing {category.name}:")

        for doc in category.glob('*.md'):
            print(f" {doc.name}", end="")

            with open(doc, 'r') as f:
                content = f.read()

            # Apply chunking strategy from Task 4: 500-char window, 400-char stride (100-char overlap)
            chunks = [content[i:i+500] for i in range(0, len(content), 400)]

            for i, chunk in enumerate(chunks):
                doc_id = f"{doc.stem}_{i}"
                # Apply embedding from Task 5!
                embedding = model.encode(chunk).tolist()

                # Store in database from Task 3!
                collection.add(
                    ids=[doc_id],
                    embeddings=[embedding],
                    documents=[chunk],
                    metadatas=[{"file": doc.name, "category": category.name}]
                )
                total_chunks += 1

            doc_count += 1
            print(f" ({len(chunks)} chunks)")

print("\n" + "="*50)
print("INGESTION COMPLETE!")
print("Statistics:")
print(f" • Documents processed: {doc_count}")
print(f" • Knowledge chunks: {total_chunks}")
print(f" • AI IQ increased: +{doc_count*10} points")
print("\nValue delivered: $500K in searchable knowledge!")

# Save results
with open('ingest-complete.txt', 'w') as f:
    f.write(f"DOCS:{doc_count},CHUNKS:{collection.count()}")

python/large-rag/init_scripts.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
python3 -m venv venv && source venv/bin/activate
pip install uv && uv pip install chromadb sentence-transformers openai flask
echo "READY" > /rag-setup-complete.txt

python/large-rag/requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
chromadb==0.4.22
sentence-transformers==2.3.1
openai==1.12.0
flask==3.0.0
python-dotenv==1.0.0

python/large-rag/test_embeddings.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
from sentence_transformers import SentenceTransformer
import numpy as np

print(" Loading AI Brain (all-MiniLM-L6-v2)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print(" Brain loaded! ~23M parameters ready!\n")

# TechCorp test sentences
sentences = [
    "Dogs are allowed in the office on Fridays",
    "Pets can come to work on Furry Fridays",
    "Remote work policy allows 3 days from home"
]

print(" Converting text to vectors...")
embeddings = model.encode(sentences)
print(f" Created {len(embeddings)} vectors of {len(embeddings[0])} dimensions each!\n")

# Calculate semantic similarities (dot product of the embeddings)
sim_1_2 = np.dot(embeddings[0], embeddings[1])
sim_1_3 = np.dot(embeddings[0], embeddings[2])

print(" Semantic Similarity Analysis:")
print("="*50)
print("'Dogs allowed' ←→ 'Pets permitted'")
print(f"Similarity: {sim_1_2:.3f} (Very Related!)\n")

print("'Dogs allowed' ←→ 'Remote work'")
print(f"Similarity: {sim_1_3:.3f} (Not Related)\n")

# Visualization
print(" Similarity Scale:")
print("0.0" + " " * 17 + "1.0")
print(f" Remote {'█' * int(sim_1_3 * 20)}")
print(f" Pets   {'█' * int(sim_1_2 * 20)}")

# Save results
with open('../embedding-test.txt', 'w') as f:
    f.write(f"SIM_PET:{sim_1_2:.3f},SIM_REMOTE:{sim_1_3:.3f}")
