|
| 1 | +# -*- coding: utf-8 -*- |
| 2 | +""" |
| 3 | +[Martinez-Gil2024c] Improving Source Code Similarity Detection with GraphCodeBERT and Additional Feature Integration, arXiv preprint arXiv:xxxx.xxxxx, 2024 |
| 4 | +
|
| 5 | +@author: Jorge Martinez-Gil |
| 6 | +""" |
| 7 | + |
| 8 | +# Install the required transformers package with PyTorch support |
| 9 | +# The -U flag ensures that the package is updated to the latest version if not already installed. |
| 10 | +#!pip install transformers[torch] -U |
| 11 | + |
| 12 | +import torch |
| 13 | +import torch.nn as nn |
| 14 | +from torch.utils.data import Dataset |
| 15 | +from transformers import RobertaTokenizer, RobertaForSequenceClassification, AutoTokenizer, AutoModel, Trainer, TrainingArguments, EarlyStoppingCallback |
| 16 | +from transformers.modeling_outputs import SequenceClassifierOutput |
| 17 | +import json |
| 18 | +import random |
| 19 | +from sklearn.metrics import precision_recall_fscore_support, accuracy_score |
| 20 | + |
# Free up memory at import time, before any model or dataset is created.
import gc

torch.cuda.empty_cache()  # ask PyTorch to release its cached GPU memory
gc.collect()  # run a garbage-collection pass over unreachable Python objects
| 25 | + |
| 26 | +# Custom model class inheriting from nn.Module |
class RobertaForSequenceClassificationWithOutput(nn.Module):
    """GraphCodeBERT sequence classifier that also consumes an extra numeric feature.

    The encoder's pooled output is concatenated with a linear projection of
    the additional feature, dropout is applied to the concatenation, and a
    single linear layer produces the class logits.
    """

    def __init__(self, num_labels=2, output_feature_dim=1):
        """Build the encoder, the feature projection and the classifier head.

        Args:
            num_labels: Number of output labels/classes for classification.
            output_feature_dim: Number of values in the additional feature
                supplied for each example.
        """
        super().__init__()
        self.num_labels = num_labels

        # Pre-trained GraphCodeBERT encoder.
        self.roberta = AutoModel.from_pretrained('microsoft/graphcodebert-base')
        hidden_size = self.roberta.config.hidden_size

        # Projects the additional feature up to the encoder's hidden size.
        self.output_feature_layer = nn.Linear(output_feature_dim, hidden_size)

        # The classifier sees [pooled encoder output ; projected feature],
        # hence twice the hidden size on its input side.
        self.classifier = nn.Linear(hidden_size + hidden_size, num_labels)

        # Dropout with the same probability the encoder config uses.
        self.dropout = nn.Dropout(self.roberta.config.hidden_dropout_prob)

    @staticmethod
    def _as_feature_matrix(output_feature, feature_dim):
        """Return ``output_feature`` shaped ``(batch, feature_dim)``.

        A rank-1 tensor is treated as one scalar per example and gets a
        trailing axis; a rank-2 tensor is used as is.

        Raises:
            ValueError: If the result is not ``(batch, feature_dim)``.
        """
        if output_feature.dim() == 1:
            output_feature = output_feature.unsqueeze(-1)
        if output_feature.dim() != 2 or output_feature.size(-1) != feature_dim:
            raise ValueError(
                f"output_feature must have shape (batch, {feature_dim}) "
                f"(or (batch,) when the feature is a scalar), "
                f"got {tuple(output_feature.shape)}"
            )
        return output_feature

    def forward(self, input_ids, attention_mask=None, labels=None, output_feature=None):
        """Run the encoder, fuse the additional feature and classify.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional attention mask passed to the encoder.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed against the logits.
            output_feature: Additional feature tensor, one scalar or one
                vector of ``output_feature_dim`` values per example.

        Returns:
            SequenceClassifierOutput with ``logits`` and ``loss`` (``None``
            when ``labels`` is not provided).

        Raises:
            ValueError: If ``output_feature`` is missing or has a shape the
                feature projection cannot consume.
        """
        # Validate before running the (expensive) encoder so a misconfigured
        # batch fails fast with a readable message.
        if output_feature is None:
            raise ValueError(
                "output_feature is required: the classifier concatenates it "
                "with the encoder output"
            )
        feature_matrix = self._as_feature_matrix(
            output_feature, self.output_feature_layer.in_features
        )

        # Pooled representation from the encoder.
        outputs = self.roberta(input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output

        # Project the additional feature and fuse it with the pooled output.
        output_feature_processed = self.output_feature_layer(feature_matrix)
        combined_features = torch.cat((pooled_output, output_feature_processed), dim=1)
        combined_features = self.dropout(combined_features)

        logits = self.classifier(combined_features)

        # The loss is only defined when labels are supplied.
        loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(loss=loss, logits=logits)
| 73 | + |
| 74 | +# Custom dataset class for code pairs |
class CodePairDataset(Dataset):
    """Dataset of code pairs read from a JSON file.

    Each record is accessed by integer position and must provide the keys
    ``"code1"``, ``"code2"``, ``"score"`` (the label) and ``"output"`` (the
    additional feature).
    """

    def __init__(self, file_path, tokenizer, max_length=512):
        """Load the records and remember how to tokenize them.

        Args:
            file_path: Path to the JSON dataset file.
            tokenizer: Callable used to encode each code pair; it is invoked
                with ``text``, ``text_pair``, ``truncation``, ``padding``,
                ``max_length`` and ``return_tensors`` keyword arguments.
            max_length: Length every encoded pair is padded/truncated to.
        """
        # Read as UTF-8 explicitly: the platform default encoding would
        # mis-decode non-ASCII characters inside the stored source code.
        with open(file_path, 'r', encoding='utf-8') as file:
            self.data = json.load(file)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __getitem__(self, idx):
        """Return the encoded code pair at position ``idx``.

        Returns:
            dict with the tokenizer's tensors (leading batch axis removed)
            plus ``labels`` (long tensor built from ``"score"``) and
            ``output_feature`` (float tensor built from ``"output"``).
        """
        item = self.data[idx]

        # Encode both snippets as one padded/truncated sequence pair.
        encoding = self.tokenizer(
            text=item["code1"],
            text_pair=item["code2"],
            truncation=True,
            padding="max_length",
            max_length=self.max_length,
            return_tensors="pt",
        )

        # return_tensors="pt" adds a batch axis of size 1; drop it so the
        # items can be stacked into batches later.
        encoding = {key: val.squeeze(0) for key, val in encoding.items()}

        encoding['labels'] = torch.tensor(item["score"], dtype=torch.long)
        encoding['output_feature'] = torch.tensor(item["output"], dtype=torch.float)

        return encoding

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)
| 110 | + |
| 111 | +# Function to compute evaluation metrics during training |
def compute_metrics(p):
    """Score predictions with accuracy, precision, recall and F1.

    ``p`` is unpacked as a ``(predictions, labels)`` pair. The predicted
    class is the argmax over the last axis of ``predictions``; precision,
    recall and F1 use scikit-learn's ``average='binary'`` setting.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    scores, references = p
    predicted_classes = scores.argmax(-1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        references, predicted_classes, average='binary'
    )
    accuracy = accuracy_score(references, predicted_classes)

    return {
        'accuracy': accuracy,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }
| 120 | + |
def main():
    """Fine-tune the classifier on the code-pair dataset and report metrics.

    Loads the tokenizer and dataset, splits the data 80/10/10 into train,
    validation and test subsets, trains with early stopping, then prints
    precision, recall and F1 for the validation and test subsets.
    """
    # Local imports: only this function needs them.
    import inspect
    import os

    tokenizer = AutoTokenizer.from_pretrained('microsoft/graphcodebert-base')

    # Build the path portably; a literal 'data\data2.json' relies on the
    # invalid escape sequence '\d' and only resolves on Windows.
    dataset_path = os.path.join('data', 'data2.json')  # Your dataset path

    full_dataset = CodePairDataset(file_path=dataset_path, tokenizer=tokenizer)

    # 80% train; the remainder is halved into validation and test.
    train_size = int(0.8 * len(full_dataset))
    test_val_size = len(full_dataset) - train_size
    val_size = int(0.5 * test_val_size)
    test_size = test_val_size - val_size

    # NOTE(review): no generator/seed is passed here, so the split differs
    # between runs -- confirm whether reproducible splits are wanted.
    train_dataset, remaining_dataset = torch.utils.data.random_split(
        full_dataset, [train_size, test_val_size]
    )
    val_dataset, test_dataset = torch.utils.data.random_split(
        remaining_dataset, [val_size, test_size]
    )

    model = RobertaForSequenceClassificationWithOutput(num_labels=2, output_feature_dim=1)

    # Newer transformers releases name this argument `eval_strategy`, older
    # ones `evaluation_strategy`; use whichever the installed version accepts.
    accepted_args = inspect.signature(TrainingArguments.__init__).parameters
    if 'eval_strategy' in accepted_args:
        eval_strategy_key = 'eval_strategy'
    else:
        eval_strategy_key = 'evaluation_strategy'

    # NOTE(review): warm-up, evaluation and checkpointing all use 500-step
    # intervals; on a small dataset training may finish before step 500 --
    # confirm these values against the dataset size.
    training_args = TrainingArguments(
        output_dir='./results',          # Directory to save the model and results
        num_train_epochs=3,              # Number of epochs to train
        per_device_train_batch_size=8,   # Batch size per GPU/CPU
        warmup_steps=500,                # Number of warmup steps for learning rate scheduler
        weight_decay=0.01,               # Weight decay for regularization
        logging_dir='./logs',            # Directory for storing logs
        eval_steps=500,                  # Steps interval for evaluation
        save_strategy="steps",           # Save the model every N steps
        save_steps=500,                  # Steps interval for saving the model
        load_best_model_at_end=True,     # Load the best model at the end of training
        metric_for_best_model="f1",      # Metric to use for selecting the best model
        **{eval_strategy_key: "steps"},  # Evaluate the model every N steps
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics,  # Function to compute metrics
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],  # Stop early if no improvement
    )

    trainer.train()

    # Report the same three metrics for the validation set, then the test set.
    for split_name, split_dataset in (("Validation", val_dataset), ("Test", test_dataset)):
        results = trainer.evaluate(split_dataset)
        print(f"{split_name} Precision: {results['eval_precision']:.4f}")
        print(f"{split_name} Recall: {results['eval_recall']:.4f}")
        print(f"{split_name} F1 Score: {results['eval_f1']:.4f}")


# Run the training pipeline only when executed as a script.
if __name__ == "__main__":
    main()
0 commit comments