
Commit 2dbf7c5

First release

1 parent f63ca31

File tree

10 files changed: +6534 −2 lines

README.md

Lines changed: 3 additions & 2 deletions
@@ -12,7 +12,8 @@ In this project, we extend the capabilities of GraphCodeBERT—a transformer mod
 
 ### Repository Contents
 
-- `graphcodebert_fint.ipynb`: Jupyter Notebook that includes the full implementation of the model, from data loading and preprocessing to training, evaluation, and results interpretation. Detailed comments and documentation are provided within the notebook.
+- `graphcodebert_fint.ipynb`: Jupyter Notebook that includes the full implementation of the model, from data loading and preprocessing to training, evaluation, and interpretation of results. Detailed comments and documentation are provided within the notebook. It is optimized for Google Colab, since the use of a GPU is highly recommended (see the GPU-check sketch below).
+- `fine-tunning-graphcodebert-karnalim-with-features.py`: The source code as a standard Python application.
 
## 🛠️ Methodology
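
Since the notebook targets a Colab GPU runtime, it is worth verifying that a GPU is actually available before training. The following is a generic sketch, not part of this commit:

```python
import torch

# Verify that a CUDA-capable GPU is visible to PyTorch before training.
if torch.cuda.is_available():
    print("GPU available:", torch.cuda.get_device_name(0))
else:
    print("No GPU detected; fine-tuning on CPU will be very slow.")
```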
@@ -24,7 +25,7 @@ The model is an extension of GraphCodeBERT, which is a transformer-based model p
 
 We utilize the IR-Plag dataset, which is specifically designed for benchmarking source code similarity detection techniques, particularly in academic plagiarism contexts. The dataset contains 467 code files, with 355 labeled as plagiarized. The diversity in coding styles and structures within this dataset makes it ideal for evaluating the effectiveness of our model.
 
 ### Training and Evaluation
-The model was trained using PyTorch and the Hugging Face Transformers library. The training process included random splits of the dataset into training, validation, and test sets. Key metrics such as precision, recall, and f-measure were computed to evaluate the model's performance. The notebook documents the training arguments, including batch size, number of epochs, and learning rate adjustments.
+The training process included random splits of the dataset into training, validation, and test sets. Key metrics such as precision, recall, and f-measure were computed to evaluate the model's performance. The notebook documents the training arguments, including batch size, number of epochs, and learning rate adjustments.
 
 ## 📈 Results

data/data2.json

Lines changed: 3222 additions & 0 deletions
Large diffs are not rendered by default.
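
Although the diff is not rendered, the loading code in the new script (`CodePairDataset`, below) implies the record layout: two code snippets, an integer label, and a float similarity feature. A hypothetical record, for illustration only:

```python
# Hypothetical record shape for data/data2.json, inferred from CodePairDataset;
# the snippets, the label convention, and the feature value are illustrative.
record = {
    "code1": "def add(a, b):\n    return a + b",   # first code snippet
    "code2": "def sum2(x, y):\n    return x + y",  # second code snippet
    "score": 1,      # integer class label (two classes), used as `labels`
    "output": 0.87,  # additional float feature for the feature-integration layer
}
```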

data/test.json

Lines changed: 416 additions & 0 deletions
Large diffs are not rendered by default.

data/training.json

Lines changed: 1934 additions & 0 deletions
Large diffs are not rendered by default.

data/validation.json

Lines changed: 416 additions & 0 deletions
Large diffs are not rendered by default.
fine-tunning-graphcodebert-karnalim-with-features.py

Lines changed: 190 additions & 0 deletions
@@ -0,0 +1,190 @@
# -*- coding: utf-8 -*-
"""
[Martinez-Gil2024c] Improving Source Code Similarity Detection with GraphCodeBERT and Additional Feature Integration, arXiv preprint arXiv:xxxx.xxxxx, 2024

@author: Jorge Martinez-Gil
"""

# Install the required transformers package with PyTorch support.
# The -U flag ensures that the package is updated to the latest version if not already installed.
#!pip install transformers[torch] -U

import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments, EarlyStoppingCallback
from transformers.modeling_outputs import SequenceClassifierOutput
import json
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Garbage collection to free up memory
import gc
torch.cuda.empty_cache()  # Clear GPU memory
gc.collect()  # Collect any unused objects in CPU RAM

# Custom model class inheriting from nn.Module
class RobertaForSequenceClassificationWithOutput(nn.Module):
    def __init__(self, num_labels=2, output_feature_dim=1):
        """
        Initialize the model with:
        - num_labels: Number of output labels/classes for classification
        - output_feature_dim: Dimension of the additional output feature
        """
        super().__init__()
        self.num_labels = num_labels

        # Load the pre-trained GraphCodeBERT model
        self.roberta = AutoModel.from_pretrained('microsoft/graphcodebert-base')

        # Layer to project the additional output feature to the hidden size
        self.output_feature_layer = nn.Linear(output_feature_dim, self.roberta.config.hidden_size)

        # Classifier layer over the concatenated BERT output and additional feature
        self.classifier = nn.Linear(2 * self.roberta.config.hidden_size, num_labels)

        # Dropout layer to avoid overfitting
        self.dropout = nn.Dropout(self.roberta.config.hidden_dropout_prob)

    def forward(self, input_ids, attention_mask=None, labels=None, output_feature=None):
        """
        Forward pass of the model.
        """
        # Get the outputs from the BERT model
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # Process the additional feature and concatenate with the BERT output
        output_feature_processed = self.output_feature_layer(output_feature.unsqueeze(-1))
        combined_features = torch.cat((pooled_output, output_feature_processed), dim=1)
        combined_features = self.dropout(combined_features)

        # Pass through the classifier
        logits = self.classifier(combined_features)

        # Calculate the loss if labels are provided
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        # Return the output in the expected format
        return SequenceClassifierOutput(loss=loss, logits=logits)

# Custom dataset class for code pairs
class CodePairDataset(Dataset):
    def __init__(self, file_path, tokenizer):
        """
        Initialize the dataset with:
        - file_path: Path to the JSON dataset file
        - tokenizer: Pre-trained tokenizer for encoding the data
        """
        # Load the dataset from the file
        with open(file_path, 'r') as file:
            self.data = json.load(file)
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """
        Get an item (code pair) from the dataset.
        """
        item = self.data[idx]

        # Tokenize the code pair
        encoding = self.tokenizer(text=item["code1"], text_pair=item["code2"], truncation=True, padding="max_length", max_length=512, return_tensors="pt")

        # Squeeze the batch dimension out of the encoding
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}

        # Add the label and the additional output feature
        encoding['labels'] = torch.tensor(item["score"], dtype=torch.long)
        encoding['output_feature'] = torch.tensor(item["output"], dtype=torch.float)

        return encoding

    def __len__(self):
        """
        Return the length of the dataset.
        """
        return len(self.data)

# Function to compute evaluation metrics during training
def compute_metrics(p):
    """
    Compute accuracy, precision, recall, and F1 score for the predictions.
    """
    predictions, labels = p
    predictions = predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions, average='binary')
    return {'accuracy': accuracy_score(labels, predictions), 'f1': f1, 'precision': precision, 'recall': recall}

def main():
    """
    Main function to run the training and evaluation.
    """
    # Load the tokenizer
    tokenizer = AutoTokenizer.from_pretrained('microsoft/graphcodebert-base')

    # Path to the dataset file
    dataset_path = 'data/data2.json'  # Your dataset path

    # Load the dataset
    full_dataset = CodePairDataset(file_path=dataset_path, tokenizer=tokenizer)

    # Split dataset into training (80%), validation (10%), and test (10%) sets
    train_size = int(0.8 * len(full_dataset))
    test_val_size = len(full_dataset) - train_size
    val_size = int(0.5 * test_val_size)  # Half of the remaining for validation
    test_size = test_val_size - val_size

    # Randomly split the dataset into train, validation, and test sets
    train_dataset, remaining_dataset = torch.utils.data.random_split(full_dataset, [train_size, test_val_size])
    val_dataset, test_dataset = torch.utils.data.random_split(remaining_dataset, [val_size, test_size])

    # Initialize the model
    model = RobertaForSequenceClassificationWithOutput(num_labels=2, output_feature_dim=1)

    # Set training arguments
    training_args = TrainingArguments(
        output_dir='./results',          # Directory to save the model and results
        num_train_epochs=3,              # Number of epochs to train
        per_device_train_batch_size=8,   # Batch size per GPU/CPU
        warmup_steps=500,                # Number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # Weight decay for regularization
        logging_dir='./logs',            # Directory for storing logs
        evaluation_strategy="steps",     # Evaluate the model every N steps
        eval_steps=500,                  # Steps interval for evaluation
        save_strategy="steps",           # Save the model every N steps
        save_steps=500,                  # Steps interval for saving the model
        load_best_model_at_end=True,     # Load the best model at the end of training
        metric_for_best_model="f1",      # Metric to use for selecting the best model
    )

    # Initialize the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,  # Function to compute metrics
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop early if no improvement
    )

    # Train the model
    trainer.train()

    # Evaluate the model on the validation set
    val_results = trainer.evaluate(val_dataset)
    print(f"Validation Precision: {val_results['eval_precision']:.4f}")
    print(f"Validation Recall: {val_results['eval_recall']:.4f}")
    print(f"Validation F1 Score: {val_results['eval_f1']:.4f}")

    # Evaluate the model on the test set
    test_results = trainer.evaluate(test_dataset)
    print(f"Test Precision: {test_results['eval_precision']:.4f}")
    print(f"Test Recall: {test_results['eval_recall']:.4f}")
    print(f"Test F1 Score: {test_results['eval_f1']:.4f}")

# Run the main function
if __name__ == "__main__":
    main()
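
As a usage note, a minimal inference sketch follows. It is not part of the commit and assumes the Trainer checkpoint stores the weights as a plain state dict; the checkpoint path, the code pair, and the 0.9 feature value are hypothetical:

```python
import torch
from transformers import AutoTokenizer

# Assumes RobertaForSequenceClassificationWithOutput from the script above.
tokenizer = AutoTokenizer.from_pretrained('microsoft/graphcodebert-base')
model = RobertaForSequenceClassificationWithOutput(num_labels=2, output_feature_dim=1)

# Hypothetical checkpoint path; the Trainer writes checkpoints under ./results.
state_dict = torch.load('results/checkpoint-500/pytorch_model.bin', map_location='cpu')
model.load_state_dict(state_dict)
model.eval()

# Illustrative code pair and precomputed similarity feature.
enc = tokenizer(text="def f(x): return x + 1",
                text_pair="def g(y): return y + 1",
                truncation=True, padding="max_length",
                max_length=512, return_tensors="pt")
with torch.no_grad():
    out = model(input_ids=enc["input_ids"],
                attention_mask=enc["attention_mask"],
                output_feature=torch.tensor([0.9]))

# Assuming label 1 means "plagiarized".
print("plagiarized" if out.logits.argmax(-1).item() == 1 else "not plagiarized")
```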
