Commit 40bb063

Detailed step-by-step explanation added, with document ingestion, test_embeddings, test_rag_pipeline & test_search.py, plus a Flask-based UI to explore RAG
1 parent f0f9c02 commit 40bb063

16 files changed: +452 -0 lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
@@ -33,3 +33,4 @@ build/
.vscode/
/.venv/
/my_chroma_data/
+/python/chroma_db/
Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
1) requirements.txt
2) steps.txt
3) chroma_client_setup_1.py
4) ingest_documents.py
5) test_embeddings.py
6) test_rag_pipeline.py
7) test_search.py
8) app.py

python/large-rag/app.py

Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
"""
TechCorp AI Assistant - Interactive RAG Chat Interface
"""

from flask import Flask, render_template, request, jsonify, Response, stream_with_context
import os
import sys
from datetime import datetime
import json
import time

# Add core modules to path
sys.path.append(os.path.join(os.path.dirname(__file__), 'core'))

from core.vector_engine import VectorEngine
from core.chat_engine import ChatEngine
from core.document_processor import DocumentProcessor

app = Flask(__name__)

# Initialize RAG components
print("\n" + "="*60)
print("🚀 Starting TechCorp AI Assistant")
print("="*60)
print("\n[INIT] Loading RAG components...")

vector_engine = VectorEngine()
print("[INIT] Vector engine ready")

chat_engine = ChatEngine(vector_engine)
print("[INIT] Chat engine ready")

doc_processor = DocumentProcessor(vector_engine)
print("[INIT] Document processor ready")


@app.route('/')
def index():
    """Render the chat interface"""
    return render_template('index.html')


@app.route('/chat', methods=['POST'])
def chat():
    """Handle chat messages"""
    try:
        data = request.json
        user_message = data.get('message', '')

        if not user_message:
            return jsonify({'error': 'No message provided'}), 400

        # Get response from RAG system
        response = chat_engine.get_response(user_message)

        return jsonify({
            'response': response['answer'],
            'sources': response['sources'],
            'confidence': response['confidence'],
            'timestamp': datetime.now().isoformat()
        })

    except Exception as e:
        return jsonify({'error': str(e)}), 500


@app.route('/api/chat/stream', methods=['POST'])
def chat_stream():
    """Handle chat messages with streaming response"""
    def generate():
        try:
            data = request.json
            user_message = data.get('message', '')

            if not user_message:
                yield f"data: {json.dumps({'error': 'No message provided'})}\n\n"
                return

            # Send initial event
            yield f"data: {json.dumps({'event': 'start'})}\n\n"

            # Get response from RAG system
            response = chat_engine.get_response(user_message)

            # Stream the response word by word
            words = response['answer'].split()
            for i, word in enumerate(words):
                time.sleep(0.05)  # Small delay for streaming effect
                yield f"data: {json.dumps({'event': 'token', 'content': word + ' '})}\n\n"

            # Send sources at the end
            yield f"data: {json.dumps({'event': 'sources', 'sources': response['sources'], 'confidence': response['confidence']})}\n\n"

            # Send completion event
            yield f"data: {json.dumps({'event': 'done'})}\n\n"

        except Exception as e:
            yield f"data: {json.dumps({'event': 'error', 'error': str(e)})}\n\n"

    return Response(
        stream_with_context(generate()),
        mimetype='text/event-stream',
        headers={
            'Cache-Control': 'no-cache',
            'X-Accel-Buffering': 'no'
        }
    )


@app.route('/api/status', methods=['GET'])
def status():
    """Get system status"""
    try:
        stats = vector_engine.get_stats()
        return jsonify({
            'status': 'operational',
            'documents': stats['total_documents'],
            'chunks': stats['total_chunks'],
            'last_updated': stats['last_updated']
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500


if __name__ == '__main__':
    # Initialize database with documents on first run
    if not vector_engine.is_initialized():
        print("First run detected. Processing TechCorp documents...")
        doc_processor.process_all_documents()
        print("Document processing complete!")

    # Run the app
    app.run(host='0.0.0.0', port=5252, debug=True)
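
The /api/chat/stream route above emits server-sent events as "data: {...}" lines with event types start, token, sources, done, and error. A minimal client sketch for that endpoint (not part of this commit; it assumes the server is running locally on port 5252 as configured in app.run, and the file name is hypothetical):

# sse_client_sketch.py (hypothetical) - consume the /api/chat/stream endpoint
import json
import requests

resp = requests.post(
    "http://localhost:5252/api/chat/stream",
    json={"message": "What is the return policy?"},
    stream=True,
)

for raw in resp.iter_lines(decode_unicode=True):
    if not raw or not raw.startswith("data: "):
        continue  # skip blank separators between events
    event = json.loads(raw[len("data: "):])
    if event.get("event") == "token":
        print(event["content"], end="", flush=True)
    elif event.get("event") == "sources":
        print("\nSources:", event["sources"], "| confidence:", event["confidence"])
    elif event.get("event") in ("done", "error"):
        break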

python/large-rag/check_chunking.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
import os

print(" DOCUMENT CHUNKING ENGINE")
print("="*40)

def chunk_text(text, size=500, overlap=100):
    """Smart chunking with overlap for context preservation"""
    chunks = []
    start = 0

    while start < len(text):
        end = min(start + size, len(text))
        chunk = text[start:end]
        chunks.append(chunk)

        if end >= len(text):
            break

        start += size - overlap

    return chunks

# Process sample document
with open('../simple-rag-explained/amazon_return_policy.txt', 'r') as file:
    sample_doc = file.read().replace('\n', '')

print(f" Original document: {len(sample_doc)} characters")
print("-"*40)

chunks = chunk_text(sample_doc, size=500, overlap=100)

print(f" Created {len(chunks)} chunks")
print("-"*40)

for i, chunk in enumerate(chunks, 1):
    print(f"\nChunk {i} ({len(chunk)} chars):")
    print(f"Preview: {chunk[:60]}...")

# Save verification
with open('../chunk-test.txt', 'w') as f:
    f.write(f"CHUNKS:{len(chunks)}")

print("\n" + "="*40)
print(" Chunking complete!")
print(f" Stats: {len(chunks)} chunks from {len(sample_doc)} chars")
print(" Ready for vectorization!")

python/large-rag/chroma_client_setup_1.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
import chromadb
from chromadb.config import Settings

print(" Initializing ChromaDB persistent client...")
client = chromadb.PersistentClient(
    path="../chroma_db",
    settings=Settings(anonymized_telemetry=False)
)

collection = client.get_or_create_collection(
    name="return_policy_data",
    metadata={"hnsw:space": "cosine"}
)

print(f" Collection created: {collection.name}")
print(f" Memories: {collection.count()}")
print(" AI Data Ready!")

python/large-rag/ingest_documents.py

Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
import os
import chromadb
from sentence_transformers import SentenceTransformer
from pathlib import Path

print("TECHCORP KNOWLEDGE INGESTION SYSTEM")
print("="*50)

# Initialize systems
print("Connecting to AI Brain (from Task 3)...")
client = chromadb.PersistentClient(path="../chroma_db")
collection = client.get_collection("techcorp_docs")

print("Loading Semantic Processor (from Task 5)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print("All systems online!\n")

# Process documents
print("Beginning knowledge transfer...")
doc_count = 0
total_chunks = 0

for category in Path('/techcorp-docs').iterdir():
    if category.is_dir():
        print(f"\nProcessing {category.name}:")

        for doc in category.glob('*.md'):
            print(f" {doc.name}", end="")

            with open(doc, 'r') as f:
                content = f.read()

            # Apply chunking strategy from Task 4: 500-char window, 400-char stride (100-char overlap)
            chunks = [content[i:i+500] for i in range(0, len(content), 400)]

            for i, chunk in enumerate(chunks):
                doc_id = f"{doc.stem}_{i}"
                # Apply embedding from Task 5!
                embedding = model.encode(chunk).tolist()

                # Store in database from Task 3!
                collection.add(
                    ids=[doc_id],
                    embeddings=[embedding],
                    documents=[chunk],
                    metadatas=[{"file": doc.name, "category": category.name}]
                )
                total_chunks += 1

            doc_count += 1
            print(f" ({len(chunks)} chunks)")

print("\n" + "="*50)
print("INGESTION COMPLETE!")
print("Statistics:")
print(f" • Documents processed: {doc_count}")
print(f" • Knowledge chunks: {total_chunks}")
print(f" • AI IQ increased: +{doc_count*10} points")
print("\nValue delivered: $500K in searchable knowledge!")

# Save results
with open('ingest-complete.txt', 'w') as f:
    f.write(f"DOCS:{doc_count},CHUNKS:{collection.count()}")

python/large-rag/init_scripts.txt

Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
python3 -m venv venv && source venv/bin/activate
pip install uv && uv pip install chromadb sentence-transformers openai flask
echo "READY" > /rag-setup-complete.txt

python/large-rag/requirements.txt

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
chromadb==0.4.22
sentence-transformers==2.3.1
openai==1.12.0
flask==3.0.0
python-dotenv==1.0.0

python/large-rag/test_embeddings.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
from sentence_transformers import SentenceTransformer
import numpy as np

print(" Loading AI Brain (all-MiniLM-L6-v2)...")
model = SentenceTransformer('all-MiniLM-L6-v2')
print(" Brain loaded! ~23M parameters ready!\n")

# TechCorp test sentences
sentences = [
    "Dogs are allowed in the office on Fridays",
    "Pets can come to work on Furry Fridays",
    "Remote work policy allows 3 days from home"
]

print(" Converting text to vectors...")
embeddings = model.encode(sentences)
print(f" Created {len(embeddings)} vectors of {len(embeddings[0])} dimensions each!\n")

# Calculate semantic similarities (dot product of the embeddings)
sim_1_2 = np.dot(embeddings[0], embeddings[1])
sim_1_3 = np.dot(embeddings[0], embeddings[2])

print(" Semantic Similarity Analysis:")
print("="*50)
print("'Dogs allowed' ←→ 'Pets permitted'")
print(f"Similarity: {sim_1_2:.3f} (Very Related!)\n")

print("'Dogs allowed' ←→ 'Remote work'")
print(f"Similarity: {sim_1_3:.3f} (Not Related)\n")

# Visualization
print(" Similarity Scale:")
print("0.0" + " " * 17 + "1.0")
print(f" Remote {'█' * int(sim_1_3 * 20)}")
print(f" Pets   {'█' * int(sim_1_2 * 20)}")

# Save results
with open('../embedding-test.txt', 'w') as f:
    f.write(f"SIM_PET:{sim_1_2:.3f},SIM_REMOTE:{sim_1_3:.3f}")
