-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmain.py
More file actions
40 lines (33 loc) · 1.51 KB
/
main.py
File metadata and controls
40 lines (33 loc) · 1.51 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import os
import time

import torch
from transformers import AutoTokenizer, AutoModel

# --- Measure how long the CUDA availability check / device setup takes ---
start_time = time.time()
# Holds the device *name* string ("cuda" or "cpu"), not a boolean.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
end_time = time.time()
print("Using device:", device)
print(f"CUDA loading time: {end_time - start_time:.2f} seconds")

# Local cache directory for the pretrained CodeBERT checkpoint.
local_dir = "codebert_model"
pretrained_model_name = "microsoft/codebert-base"

# Report whether the model will come from the local cache or the Hub.
if os.path.exists(local_dir) and os.listdir(local_dir):
    print(f'Looking for model in local directory & already available in path "{local_dir}"')
    print("Model will load from local directory.")
else:
    print('Downloading model from Hugging Face Hub...')

# --- Measure model loading time (from local cache or fresh download) ---
start_time = time.time()
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name, cache_dir=local_dir)
model = AutoModel.from_pretrained(pretrained_model_name, cache_dir=local_dir)
# BUG FIX: the model was never moved to the selected device, so the GPU
# (when available) was silently unused. Put it in eval mode for inference.
model = model.to(device)
model.eval()
end_time = time.time()
print(f"Model loading time from local/downloaded: {end_time - start_time:.2f} seconds")

# --- Tokenize the natural-language query and the code snippet ---
nl_tokens = tokenizer.tokenize("return maximum value")
print(nl_tokens)
code_tokens = tokenizer.tokenize("def max(a,b): if a>b: return a else return b")
# CodeBERT bimodal input layout: [CLS] nl_tokens [SEP] code_tokens [EOS]
tokens = [tokenizer.cls_token] + nl_tokens + [tokenizer.sep_token] + code_tokens + [tokenizer.eos_token]
tokens_ids = tokenizer.convert_tokens_to_ids(tokens)

# BUG FIX: create the input tensor on the same device as the model,
# and disable autograd — we only want forward-pass embeddings.
with torch.no_grad():
    input_ids = torch.tensor(tokens_ids, device=device)[None, :]
    context_embeddings = model(input_ids)[0]
# BUG FIX: the label says "shape" but the original printed the entire
# tensor; print the actual shape instead.
print("Context embeddings shape:", context_embeddings.shape)