-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcodebert_probing_example.py
More file actions
62 lines (46 loc) · 2.04 KB
/
codebert_probing_example.py
File metadata and controls
62 lines (46 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import time
import faiss
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline
# Local cache directory for the model and the Hub model id to load.
local_dir = "codebert_model"
# Wall-clock anchor for the end-to-end timing printed at the bottom of the script.
model_start_time = time.time()
pretrained_model_name = "microsoft/codebert-base-mlm"
# NOTE(review): this check only prints a message — the from_pretrained() call
# later in the file always uses the Hub model name, never local_dir (and
# nothing ever saves into local_dir). Confirm whether loading from / caching
# to local_dir was actually intended.
if os.path.exists(local_dir) and os.listdir(local_dir):
    print(f'Looking for model in local directory & already available in path "{local_dir}"')
    print("Model will load from local directory.")
else:
    print('Downloading model from Hugging Face Hub...')
def get_embedding(code):
    """Return a mean-pooled embedding (1-D float32 numpy array) for a code string.

    Uses the module-level `tokenizer` and `model`. Input is truncated to the
    512-token RoBERTa context window.
    """
    inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        # BUG FIX: `model` is a RobertaForMaskedLM, whose forward() returns a
        # MaskedLMOutput with `logits` only — it has no `last_hidden_state`,
        # so the original call raised AttributeError. Run the base encoder
        # (`model.roberta`) instead, which does expose last_hidden_state.
        outputs = model.roberta(**inputs)
    # Mean-pool over the sequence dimension, drop the batch dim of 1.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
# Helper: split a source file into fixed-size chunks of lines
def split_code_file(filepath, lines_per_chunk=20):
    """Read a text file and return it as a list of string chunks.

    Each chunk concatenates up to `lines_per_chunk` consecutive lines
    (the final chunk may be shorter).
    """
    with open(filepath) as src:
        all_lines = src.readlines()
    chunks = []
    for start in range(0, len(all_lines), lines_per_chunk):
        chunks.append(''.join(all_lines[start:start + lines_per_chunk]))
    return chunks
# --- Load model & tokenizer, measuring wall-clock load time ---
start_time = time.time()
# NOTE(review): loads by Hub name regardless of the local_dir check above;
# pass a local path here if local loading is actually wanted.
model = RobertaForMaskedLM.from_pretrained(pretrained_model_name, ignore_mismatched_sizes=True)
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name)
end_time = time.time()
print(f"Model loading time from local/downloaded: {end_time - start_time:.2f} seconds")

# --- Build a FAISS L2 index over example snippet embeddings ---
index = faiss.IndexFlatL2(768)  # 768 = CodeBERT hidden size; adjust for other models
code_snippets = ["def add(a, b): return a + b", "for i in range(10): print(i)"]
embeddings = np.array([get_embedding(code) for code in code_snippets])
index.add(embeddings)

# --- Nearest-neighbour search for a natural-language query ---
query = "loop through numbers and print"
query_embedding = get_embedding(query).reshape(1, -1)
# BUG FIX: the original asked for k=3 neighbours from an index holding only
# 2 vectors. FAISS pads missing results with label -1, and code_snippets[-1]
# then silently printed the LAST snippet as a bogus extra result. Clamp k to
# the index size and skip any -1 labels defensively.
D, I = index.search(query_embedding, k=min(3, index.ntotal))
print("Top results:")
for i in I[0]:
    if i >= 0:  # -1 means "no result" in FAISS
        print(code_snippets[i])

# --- Masked-token fill-in demo on a code fragment ---
CODE = "if (x is not None) <mask> (x>1)"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
outputs = fill_mask(CODE)
print(outputs)

# Total elapsed time since the script's setup phase began.
model_end_time = time.time()
print(f"Model inference time: {model_end_time - model_start_time:.2f} seconds")