-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathindex_for_qa.py
More file actions
71 lines (57 loc) · 2.41 KB
/
index_for_qa.py
File metadata and controls
71 lines (57 loc) · 2.41 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import os
from pathlib import Path
from retriever.embeddings import CodeEmbeddingSystem
import logging
# Module-level logger used for all progress and error reporting in this script.
logger = logging.getLogger(__name__)

# Configure root logging at INFO so the indexing progress messages are emitted.
logging.basicConfig(level=logging.INFO)
# Directory names whose contents are never indexed (matched against whole
# path components, not substrings, so e.g. ".github" is NOT excluded).
_EXCLUDED_DIRS = frozenset({'.venv', 'venv', '__pycache__', '.git'})

# Chunks whose stripped text is not longer than this are discarded.
_MIN_CHUNK_CHARS = 50

# A line starting (at column 0) with one of these prefixes opens a new chunk.
_CHUNK_STARTS = ('def ', 'class ', 'async def ')


def _is_excluded(py_file, root):
    """Return True if *py_file* sits inside an excluded directory below *root*."""
    # Only directory components relative to the root are inspected, so the
    # location of the root itself (e.g. ".../venv-projects/app") is irrelevant.
    directories = py_file.relative_to(root).parts[:-1]
    return any(part in _EXCLUDED_DIRS for part in directories)


def _split_into_chunks(content, label):
    """Split *content* at top-level definitions and return the labelled chunks."""
    groups = [[]]
    for line in content.split('\n'):
        # Start a new group at each top-level def/class, unless the current
        # group is still empty (i.e. the file begins with a definition).
        if line.startswith(_CHUNK_STARTS) and groups[-1]:
            groups.append([])
        groups[-1].append(line)

    chunks = []
    for group in groups:
        text = '\n'.join(group)
        if len(text.strip()) > _MIN_CHUNK_CHARS:
            chunks.append(f"# File: {label}\n{text}")
    return chunks


def index_codebase(directory: str = ".", index_path: str = "code_index.pkl"):
    """Index all Python files in the directory for Q&A.

    Recursively collects ``*.py`` files under *directory*, skipping anything
    inside a directory named ``.venv``, ``venv``, ``__pycache__`` or ``.git``,
    splits each file into chunks at top-level ``def`` / ``async def`` /
    ``class`` lines, and passes the chunks to ``CodeEmbeddingSystem.index_code``.
    The resulting index is then written with ``save_index``.

    Args:
        directory: Root directory to scan. Defaults to the current directory.
        index_path: Destination passed to ``save_index``. Defaults to
            ``"code_index.pkl"``.

    Returns:
        None. Progress and problems are reported through the module logger;
        files that cannot be read or decoded as UTF-8 are logged and skipped.
    """
    # initialize the embedding system
    embedding_system = CodeEmbeddingSystem()

    # collect all Python files, excluding virtual environments and caches;
    # sorted so that the chunk order does not depend on filesystem order
    root = Path(directory)
    python_files = sorted(
        f for f in root.rglob("*.py") if not _is_excluded(f, root)
    )
    logger.info("Found %d Python files to index", len(python_files))

    # read and chunk all files
    # in production, you'd want more sophisticated chunking
    all_chunks = []
    for py_file in python_files:
        try:
            content = py_file.read_text(encoding='utf-8')
        except (OSError, UnicodeDecodeError) as e:
            # Best effort: one unreadable file must not abort the whole run.
            logger.error("Error reading %s: %s", py_file, e)
            continue
        all_chunks.extend(_split_into_chunks(content, str(py_file)))

    logger.info("Created %d code chunks", len(all_chunks))

    if not all_chunks:
        logger.error("No code chunks to index!")
        return

    # index all chunks
    embedding_system.index_code(all_chunks)

    # save the index
    embedding_system.save_index(index_path)
    logger.info("Index saved to %s", index_path)
    logger.info("You can now use the Q&A interface!")
# Script entry point: when run directly, index the current working directory.
if __name__ == "__main__":
    index_codebase(directory=".")