-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcli.py
More file actions
125 lines (104 loc) · 5.19 KB
/
cli.py
File metadata and controls
125 lines (104 loc) · 5.19 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
"""
Command Line Interface for CodeBERT Indexer
"""
import argparse
import sys
import logging
import numpy as np
import torch
from pathlib import Path
from indexers.codebert_indexer import CodeBERTIndexer
from indexers.vespa_embedding_store import VespaEmbeddingStore
def _embed_query(indexer, query):
    """Encode a free-text query into a CodeBERT embedding.

    Tokenizes *query*, runs the model without gradients, and returns the
    [CLS]-token embedding as a (1, 768) numpy array — the same vector
    layout stored in Vespa, so it can be used directly for similarity search.
    """
    inputs = indexer.tokenizer(
        query,
        padding=True,
        truncation=True,
        max_length=512,  # CodeBERT's maximum sequence length
        return_tensors='pt'
    )
    # Move every input tensor to the same device as the model.
    inputs = {k: v.to(indexer.device) for k, v in inputs.items()}
    with torch.no_grad():
        outputs = indexer.model(**inputs)
    # [CLS] token (position 0) is used as the sequence-level embedding.
    return outputs.last_hidden_state[:, 0, :].cpu().numpy()


def _file_based_search(indexer, args):
    """Search the on-disk index and print the top-k matches.

    Prints a friendly message (instead of crashing) when no index has
    been built yet at ``args.index_dir``.
    """
    try:
        code_files, embeddings = indexer.load_index(args.index_dir)
        results = indexer.search_similar_code(args.search, embeddings, args.top_k)
        print(f"\nTop {args.top_k} similar code files:")
        for file_path, similarity in results:
            print(f"{similarity:.4f}: {file_path}")
    except FileNotFoundError:
        print(f"Index not found at {args.index_dir}. Please scan a codebase first.")


def _print_statistics(indexer, code_files):
    """Compute and print summary statistics for the scanned codebase."""
    stats = indexer.generate_statistics(code_files)
    print("\nCodebase Statistics:")
    print(f"Total files: {stats['total_files']}")
    print(f"Total lines: {stats['total_lines']}")
    print(f"Total functions: {stats['total_functions']}")
    print(f"Total classes: {stats['total_classes']}")
    print(f"Average file size: {stats['avg_file_size']:.2f} bytes")
    print("Languages:")
    for lang, count in stats['languages'].items():
        print(f"  {lang}: {count} files")


def _init_vespa_store(args):
    """Create and connect a Vespa embedding store per the CLI flags.

    Connects to ``--vespa-endpoint`` when given; otherwise spins up a
    local Vespa Docker container. Returns the connected store.
    """
    vespa_store = VespaEmbeddingStore(
        embedding_dim=768  # CodeBERT embeddings dimension
    )
    if args.vespa_endpoint:
        print(f"Connecting to Vespa endpoint at {args.vespa_endpoint}")
        vespa_store.connect_to_vespa(args.vespa_endpoint)
    else:
        print("Starting Vespa Docker container...")
        vespa_store.connect_to_docker()
        print("Vespa container started and connected")
    return vespa_store


def main():
    """Main entry point for CLI.

    Parses command-line arguments and dispatches to one of three modes:
    ``--scan`` (index a codebase, optionally with ``--stats``),
    ``--search`` (query the index, via Vespa when ``--vespa`` is set,
    with a file-based fallback), or neither (usage error, exit code 1).
    """
    parser = argparse.ArgumentParser(description='CodeBERT Code Indexing System')
    parser.add_argument('--scan', type=str, help='Path to codebase to scan')
    parser.add_argument('--index-dir', type=str, default='./code_index', help='Directory to save/load index')
    parser.add_argument('--search', type=str, help='Code query to search for')
    parser.add_argument('--top-k', type=int, default=5, help='Number of top results to return')
    parser.add_argument('--stats', action='store_true', help='Show codebase statistics')
    parser.add_argument('--vespa', action='store_true', help='Use Vespa.ai for storing embeddings')
    parser.add_argument('--vespa-endpoint', type=str, help='Vespa endpoint URL (if using existing Vespa instance)')
    args = parser.parse_args()

    indexer = CodeBERTIndexer()

    # Initialize Vespa store if requested.
    vespa_store = None
    if args.vespa:
        vespa_store = _init_vespa_store(args)

    if args.scan:
        # Scan and index codebase.
        code_files = indexer.scan_codebase(args.scan)
        embeddings = indexer.generate_embeddings(code_files)

        # Store in Vespa if requested.
        if args.vespa and vespa_store:
            print("Storing embeddings in Vespa.ai...")
            count = vespa_store.store_embeddings(code_files, embeddings)
            print(f"Successfully stored {count} embeddings in Vespa")

        # Also save to files for backward compatibility.
        indexer.save_index(code_files, embeddings, args.index_dir)

        if args.stats:
            _print_statistics(indexer, code_files)
    elif args.search:
        # Search using Vespa if available, otherwise use the file-based index.
        if args.vespa and vespa_store:
            try:
                query_embedding = _embed_query(indexer, args.search)
                results = vespa_store.search_similar_code(query_embedding, args.top_k)
                print(f"\nTop {args.top_k} similar code files (from Vespa.ai):")
                for result in results:
                    print(f"{result['score']:.4f}: {result['file_path']}")
            except Exception as e:
                # Best-effort fallback: a Vespa failure degrades to the
                # local index rather than aborting the search.
                print(f"Error searching with Vespa: {e}")
                print("Falling back to file-based search...")
                _file_based_search(indexer, args)
        else:
            _file_based_search(indexer, args)
    else:
        print("Please specify --scan to index a codebase or --search to search existing index")
        sys.exit(1)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()