-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcli.py
More file actions
125 lines (104 loc) · 5.19 KB
/
cli.py
File metadata and controls
125 lines (104 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
Command Line Interface for CodeBERT Indexer
"""
import argparse
import sys
import logging
import numpy as np
import torch
from pathlib import Path
from indexers.codebert_indexer import CodeBERTIndexer
from indexers.vespa_embedding_store import VespaEmbeddingStore
def _embed_query(indexer, query):
    """Encode a free-text query into a CodeBERT embedding.

    Tokenizes *query*, runs the model without gradients, and returns the
    [CLS]-token embedding as a (1, 768) numpy array — the same vector
    layout stored in Vespa, so it can be used directly for similarity search.
    """
    inputs = indexer.tokenizer(
        query,
        padding=True,
        truncation=True,
        max_length=512,  # CodeBERT's maximum sequence length
        return_tensors='pt'
    )
    # Move every input tensor to the same device as the model.
    inputs = {k: v.to(indexer.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = indexer.model(**inputs)
    # [CLS] token (position 0) is used as the sequence-level embedding.
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()


def _file_based_search(indexer, args):
    """Search the on-disk index and print the top-k matches.

    Prints a friendly message (instead of crashing) when no index has
    been built yet at ``args.index_dir``.
    """
    try:
        code_files, embeddings = indexer.load_index(args.index_dir)
        results = indexer.search_similar_code(args.search, embeddings, args.top_k)
        print(f"\nTop {args.top_k} similar code files:")
        for file_path, similarity in results:
            print(f"{similarity:.4f}: {file_path}")
    except FileNotFoundError:
        print(f"Index not found at {args.index_dir}. Please scan a codebase first.")


def _print_statistics(indexer, code_files):
    """Compute and print summary statistics for the scanned codebase."""
    stats = indexer.generate_statistics(code_files)
    print("\nCodebase Statistics:")
    print(f"Total files: {stats['total_files']}")
    print(f"Total lines: {stats['total_lines']}")
    print(f"Total functions: {stats['total_functions']}")
    print(f"Total classes: {stats['total_classes']}")
    print(f"Average file size: {stats['avg_file_size']:.2f} bytes")
    print("Languages:")
    for lang, count in stats['languages'].items():
        print(f"  {lang}: {count} files")


def _init_vespa_store(args):
    """Create and connect a Vespa embedding store per the CLI flags.

    Connects to ``--vespa-endpoint`` when given; otherwise spins up a
    local Vespa Docker container. Returns the connected store.
    """
    vespa_store = VespaEmbeddingStore(
        embedding_dim=768  # CodeBERT embeddings dimension
    )
    if args.vespa_endpoint:
        print(f"Connecting to Vespa endpoint at {args.vespa_endpoint}")
        vespa_store.connect_to_vespa(args.vespa_endpoint)
    else:
        print("Starting Vespa Docker container...")
        vespa_store.connect_to_docker()
        print("Vespa container started and connected")
    return vespa_store


def main():
    """Main entry point for CLI.

    Parses command-line arguments and dispatches to one of three modes:
    ``--scan`` (index a codebase, optionally with ``--stats``),
    ``--search`` (query the index, via Vespa when ``--vespa`` is set,
    with a file-based fallback), or neither (usage error, exit code 1).
    """
    parser = argparse.ArgumentParser(description='CodeBERT Code Indexing System')
    parser.add_argument('--scan', type=str, help='Path to codebase to scan')
    parser.add_argument('--index-dir', type=str, default='./code_index', help='Directory to save/load index')
    parser.add_argument('--search', type=str, help='Code query to search for')
    parser.add_argument('--top-k', type=int, default=5, help='Number of top results to return')
    parser.add_argument('--stats', action='store_true', help='Show codebase statistics')
    parser.add_argument('--vespa', action='store_true', help='Use Vespa.ai for storing embeddings')
    parser.add_argument('--vespa-endpoint', type=str, help='Vespa endpoint URL (if using existing Vespa instance)')
    args = parser.parse_args()

    indexer = CodeBERTIndexer()

    # Initialize Vespa store if requested.
    vespa_store = None
    if args.vespa:
        vespa_store = _init_vespa_store(args)

    if args.scan:
        # Scan and index codebase.
        code_files = indexer.scan_codebase(args.scan)
        embeddings = indexer.generate_embeddings(code_files)

        # Store in Vespa if requested.
        if args.vespa and vespa_store:
            print("Storing embeddings in Vespa.ai...")
            count = vespa_store.store_embeddings(code_files, embeddings)
            print(f"Successfully stored {count} embeddings in Vespa")

        # Also save to files for backward compatibility.
        indexer.save_index(code_files, embeddings, args.index_dir)

        if args.stats:
            _print_statistics(indexer, code_files)
    elif args.search:
        # Search using Vespa if available, otherwise use the file-based index.
        if args.vespa and vespa_store:
            try:
                query_embedding = _embed_query(indexer, args.search)
                results = vespa_store.search_similar_code(query_embedding, args.top_k)
                print(f"\nTop {args.top_k} similar code files (from Vespa.ai):")
                for result in results:
                    print(f"{result['score']:.4f}: {result['file_path']}")
            except Exception as e:
                # Best-effort fallback: a Vespa failure degrades to the
                # local index rather than aborting the search.
                print(f"Error searching with Vespa: {e}")
                print("Falling back to file-based search...")
                _file_based_search(indexer, args)
        else:
            _file_based_search(indexer, args)
    else:
        print("Please specify --scan to index a codebase or --search to search existing index")
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()