-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcodebert_probing_example.py
More file actions
62 lines (46 loc) · 2.04 KB
/
codebert_probing_example.py
File metadata and controls
62 lines (46 loc) · 2.04 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import os
import time
import faiss
import numpy as np
import torch
from transformers import RobertaTokenizer, RobertaForMaskedLM, pipeline
# Local cache directory for the model and the Hub model id to load.
local_dir = "codebert_model"
# Wall-clock anchor for the end-to-end timing printed at the bottom of the script.
model_start_time = time.time()
pretrained_model_name = "microsoft/codebert-base-mlm"
# NOTE(review): this check only prints a message — the from_pretrained() call
# later in the file always uses the Hub model name, never local_dir (and
# nothing ever saves into local_dir). Confirm whether loading from / caching
# to local_dir was actually intended.
if os.path.exists(local_dir) and os.listdir(local_dir):
    print(f'Looking for model in local directory & already available in path "{local_dir}"')
    print("Model will load from local directory.")
else:
    print('Downloading model from Hugging Face Hub...')
def get_embedding(code):
    """Return a mean-pooled embedding (1-D float32 numpy array) for a code string.

    Uses the module-level `tokenizer` and `model`. Input is truncated to the
    512-token RoBERTa context window.
    """
    inputs = tokenizer(code, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        # BUG FIX: `model` is a RobertaForMaskedLM, whose forward() returns a
        # MaskedLMOutput with `logits` only — it has no `last_hidden_state`,
        # so the original call raised AttributeError. Run the base encoder
        # (`model.roberta`) instead, which does expose last_hidden_state.
        outputs = model.roberta(**inputs)
    # Mean-pool over the sequence dimension, drop the batch dim of 1.
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
# Helper: split a source file into fixed-size chunks of lines
def split_code_file(filepath, lines_per_chunk=20):
    """Read a text file and return it as a list of string chunks.

    Each chunk concatenates up to `lines_per_chunk` consecutive lines
    (the final chunk may be shorter).
    """
    with open(filepath) as src:
        all_lines = src.readlines()
    chunks = []
    for start in range(0, len(all_lines), lines_per_chunk):
        chunks.append(''.join(all_lines[start:start + lines_per_chunk]))
    return chunks
# --- Load model & tokenizer, measuring wall-clock load time ---
start_time = time.time()
# NOTE(review): loads by Hub name regardless of the local_dir check above;
# pass a local path here if local loading is actually wanted.
model = RobertaForMaskedLM.from_pretrained(pretrained_model_name, ignore_mismatched_sizes=True)
tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name)
end_time = time.time()
print(f"Model loading time from local/downloaded: {end_time - start_time:.2f} seconds")

# --- Build a FAISS L2 index over example snippet embeddings ---
index = faiss.IndexFlatL2(768)  # 768 = CodeBERT hidden size; adjust for other models
code_snippets = ["def add(a, b): return a + b", "for i in range(10): print(i)"]
embeddings = np.array([get_embedding(code) for code in code_snippets])
index.add(embeddings)

# --- Nearest-neighbour search for a natural-language query ---
query = "loop through numbers and print"
query_embedding = get_embedding(query).reshape(1, -1)
# BUG FIX: the original asked for k=3 neighbours from an index holding only
# 2 vectors. FAISS pads missing results with label -1, and code_snippets[-1]
# then silently printed the LAST snippet as a bogus extra result. Clamp k to
# the index size and skip any -1 labels defensively.
D, I = index.search(query_embedding, k=min(3, index.ntotal))
print("Top results:")
for i in I[0]:
    if i >= 0:  # -1 means "no result" in FAISS
        print(code_snippets[i])

# --- Masked-token fill-in demo on a code fragment ---
CODE = "if (x is not None) <mask> (x>1)"
fill_mask = pipeline('fill-mask', model=model, tokenizer=tokenizer)
outputs = fill_mask(CODE)
print(outputs)

# Total elapsed time since the script's setup phase began.
model_end_time = time.time()
print(f"Model inference time: {model_end_time - model_start_time:.2f} seconds")