diff --git a/api/__init__.py b/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/api/filter.py b/api/filter.py new file mode 100644 index 0000000..7ed57f6 --- /dev/null +++ b/api/filter.py @@ -0,0 +1,64 @@ +import sys +import re +from unicodedata import category +from bs4 import BeautifulSoup +from markdown import markdown + + +USERNAME_REGEX = r"(\s|^)@(\S*\s?)" + +# Generate Unicode punctuation set +punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith(("P", "S"))} + +# Dictionary to count token replacements +counters = {} + +def remove_punctuation(text): + """ + Remove all punctuation characters from the given text. + + Args: + text (str): The input text. + + Returns: + str: The text without any punctuation. + """ + return "".join(char for char in text if char not in punctuation) + +def clean_text(text): + """ + Remove quoted text and large code blocks from GitHub issues or comments. + + This function performs the following clean-up: + - Removes quoted email/notification text from GitHub. + - Removes code blocks enclosed in triple backticks. + + Args: + text (str): The input text (typically from a GitHub issue or comment). + + Returns: + str: The cleaned text without quoted text or code blocks. + """ + # Remove quoted text from emails/notifications + text = re.sub(r"^(On[\s\S]*?notifications@github\.com\s*?wrote:\s*?)?(^(\>).*\s)*", '', text, flags=re.MULTILINE) + + # Remove code blocks enclosed in triple backticks + text = re.sub(r"```[a-z]*\n[\s\S]*?\n```", "", text) + + return text + +def remove_markdown_content(text): + """ + Converts Markdown content to plain text by removing all Markdown formatting. + + This function processes the input Markdown text and converts it to plain text + by removing all Markdown syntax. + + Args: + text (str): The input Markdown text. + + Returns: + str: Cleaned text without Markdown formatting. 
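+
+    Example (illustrative sketch; assumes the lxml parser used in the function body is installed):
+        remove_markdown_content("**Fix** applied in `parse()`")
+        # -> 'Fix applied in parse()'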
+ """ + html = markdown(text) + return "".join(BeautifulSoup(html, "lxml").findAll(text=True)) \ No newline at end of file diff --git a/api/model.py b/api/model.py new file mode 100644 index 0000000..b847a5a --- /dev/null +++ b/api/model.py @@ -0,0 +1,479 @@ +import random +import os +import numpy as np +import pandas as pd +import torch +import time +import matplotlib.pyplot as plt +from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler +from io import StringIO +from unicodedata import category +from markdown import markdown +from sklearn.model_selection import train_test_split +from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score,classification_report +from torch.utils.data import DataLoader, RandomSampler +from transformers import ( + BertTokenizer, BertForSequenceClassification, BertForMaskedLM, + XLNetTokenizer, XLNetForSequenceClassification, + RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM, + AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM, + get_scheduler +) +from torch.optim import AdamW + +MAX_LEN = 256 +BATCH_SIZE = 16 +LEARNING_RATE = 2e-5 +EPOCHS = 4 +WEIGHT_DECAY = 0.01 +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert'] + +MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), + (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), + (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), + (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') + ] + +def train_model(train_df, model_save_path, model_task, model_tokenizer, model_name): + """ + Trains a sentiment classification model on the provided dataset. + + Args: + train_df (pd.DataFrame): DataFrame containing training data with 'text' and 'polarity' columns. + model_save_path (str): Path to save the best model. + model_task (class): Transformer model class, e.g., AutoModelForSequenceClassification + model_tokenizer (class): Tokenizer class, e.g., AutoTokenizer + model_name (str): Hugging Face model name, e.g., "bert-base-cased" + + Returns: + str: The path where the best model was saved. 
+ """ + seed_torch(42) + + train_df['polarity'] = train_df['polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + + tokenizer = model_tokenizer.from_pretrained(model_name, do_lower_case=True) + + sentences = train_df.text.values + labels = train_df.polarity.values + + input_ids, attention_masks = [], [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + truncation=True, + return_attention_mask=True, + return_tensors='pt' + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42 + ) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42 + ) + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE) + + # Use passed-in model_task and model_name + model = model_task.from_pretrained(model_name, num_labels=3) + model.to(device) + + optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) + + num_training_steps = EPOCHS * len(train_dataloader) + lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps + ) + + print("Starting training...") + best_f1 = 0 + for epoch in range(EPOCHS): + model.train() + total_loss = 0 + predictions, true_labels = [], [] + + for batch in train_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + optimizer.zero_grad() + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[:2] + loss.backward() + optimizer.step() + lr_scheduler.step() + + total_loss += loss.item() + predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + true_labels.extend(b_labels.cpu().numpy()) + + train_acc = accuracy_score(true_labels, predictions) + print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}") + + model.eval() + val_predictions, val_labels = [], [] + with torch.no_grad(): + for batch in val_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + outputs = model(b_input_ids, attention_mask=b_input_mask) + logits = outputs[0] + val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + val_labels.extend(b_labels.cpu().numpy()) + + val_acc = accuracy_score(val_labels, val_predictions) + val_f1 = f1_score(val_labels, val_predictions, average='weighted') + print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}") + + if val_f1 > best_f1: + best_f1 = val_f1 + torch.save(model.state_dict(), model_save_path) + print(f"Best model saved at {model_save_path}") + + print("Final Model Performance on Validation Set:") + print(classification_report(val_labels, val_predictions, digits=4)) + return model_save_path + +def 
predict_model(predict_df, model_saved_path, model_task, model_tokenizer, model_name):
+    """
+    Uses a pre-trained transformer model (BERT, XLNet, RoBERTa, etc.)
+    for sentiment classification and adds a polarity column in the test dataset
+    with predicted values.
+
+    Args:
+    - predict_df (pd.DataFrame): DataFrame containing text data to classify.
+    - model_saved_path (str): Path to the saved model.
+    - model_task (class): e.g., BertForSequenceClassification, XLNetForSequenceClassification
+    - model_tokenizer (class): e.g., BertTokenizer, XLNetTokenizer
+    - model_name (str): e.g., "bert-base-cased"
+
+    Returns:
+        pd.DataFrame: Same DataFrame with a new 'polarity' column containing predicted labels.
+    """
+
+    seed_torch(42)
+
+    tokenizer = model_tokenizer.from_pretrained(model_name, do_lower_case=True)
+
+    begin = time.time()
+    sentences = predict_df.text.values
+
+    input_ids = []
+    attention_masks = []
+
+    for sent in sentences:
+        encoded_dict = tokenizer.encode_plus(
+            str(sent),
+            add_special_tokens=True,
+            max_length=MAX_LEN,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt'
+        )
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    prediction_inputs = torch.cat(input_ids, dim=0)
+    prediction_masks = torch.cat(attention_masks, dim=0)
+
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(
+        prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE
+    )
+
+    # Instantiate model dynamically and load the saved weights onto the active device
+    model = model_task.from_pretrained(model_name, num_labels=3)
+    model.load_state_dict(torch.load(model_saved_path, map_location=device))
+    model.to(device)
+    model.eval()
+
+    predictions = []
+    for batch in prediction_dataloader:
+        batch = tuple(t.to(device) for t in batch)
+        b_input_ids, b_input_mask = batch
+
+        with torch.no_grad():
+            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
+            logits = outputs[0]
+
+        predictions.append(logits.detach().cpu().numpy())
+
+    end = time.time()
+    print(f'Prediction used {end - begin:.2f} seconds')
+
+    flat_predictions = [item for sublist in predictions for item in sublist]
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+
+    # Add polarity column
+    predict_df['polarity'] = flat_predictions
+
+    return predict_df
+
+"""
+def train_model(train_df, model_save_path, model_select=0):
+
+Trains a sentiment classification model on the provided dataset.
+
+Args:
+- train_df (pd.DataFrame)
+- model_save_path (str)
+- model_select (int, optional)
+
+Returns:
+    str: The path where the best model was saved.
+
+Notes:
+    - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0).
+ - Saves the models + + seed_torch(42) + + cur_model = MODELS[model_select] + m_name = MODEL_NAMES[model_select] + + + train_df['Polarity'] = train_df['Polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.Text.values + labels = train_df.Polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE) + + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.to(device) + + + optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) + + + num_training_steps = EPOCHS * len(train_dataloader) + lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps + ) + + + print("Starting training...") + best_f1 = 0 + for epoch in range(EPOCHS): + model.train() + total_loss = 0 + predictions, true_labels = [], [] + + for batch in train_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + optimizer.zero_grad() + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[:2] + loss.backward() + optimizer.step() + lr_scheduler.step() + + total_loss += loss.item() + predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + true_labels.extend(b_labels.cpu().numpy()) + + train_acc = accuracy_score(true_labels, predictions) + print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}") + + + model.eval() + val_predictions, val_labels = [], [] + with torch.no_grad(): + for batch in val_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + outputs = model(b_input_ids, attention_mask=b_input_mask) + logits = outputs[0] + val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + val_labels.extend(b_labels.cpu().numpy()) + + val_acc = accuracy_score(val_labels, val_predictions) + val_f1 = f1_score(val_labels, val_predictions, average='weighted') + print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}") + + + if val_f1 > best_f1: + best_f1 = val_f1 + torch.save(model.state_dict(), model_save_path) + print(f"Best model saved at {model_save_path}") + + + print("Final Model Performance on Validation Set:") + print(classification_report(val_labels, 
val_predictions, digits=4)) + return model_save_path +""" + +def seed_torch(seed): + """ +Set random seeds for reproducibility in PyTorch and related libraries. + +Args: +- Seed (int) : number to use for all random generators. + +Example: +seed_torch(42) + """ + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed(seed) + torch.backends.cudnn.deterministic=True + +""" +def test_model(test_df, model_saved_path, model_select=0): + +Tests a pre-trained sentiment classification model on a test dataset and evaluates its performance. + +Args: +- test_df (pd.DataFrame) +- model_saved_path (str) +- model_select (int, optional) + +Returns: +pd.DataFrame: A DataFrame with the original test data and the model's predictions. + + MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'), + (XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'), + (RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'), + (AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1') + ] + MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert'] + seed_torch(42) + + cur_model=MODELS[model_select] + m_name=MODEL_NAMES[model_select] + + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + begin=time.time() + + test_df['Polarity']=test_df['Polarity'].replace({ + 'positive':1, + 'negative':2, + 'neutral':0}) + + + sentences = test_df.Text.values + labels = test_df.Polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens = True, + max_length = MAX_LEN, + pad_to_max_length = True, + return_attention_mask = True, + return_tensors = 'pt', + ) + + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + prediction_inputs = torch.cat(input_ids,dim=0) + prediction_masks = torch.cat(attention_masks,dim=0) + prediction_labels = torch.tensor(labels) + + prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels) + prediction_sampler = SequentialSampler(prediction_data) + prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE) + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.load_state_dict(torch.load(model_saved_path)) +# model.cuda() + model.eval() + + predictions,true_labels=[],[] + + for batch in prediction_dataloader: + batch = tuple(t.to(device) for t in batch) + b_input_ids, b_input_mask, b_labels = batch + + with torch.no_grad(): + outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask) + logits = outputs[0] + + logits = logits.detach().cpu().numpy() + label_ids = b_labels.to('cpu').numpy() + + predictions.append(logits) + true_labels.append(label_ids) + + end=time.time() + print('Prediction used {:.2f} seconds'.format(end - begin)) + + flat_predictions = [item for sublist in predictions for item in sublist] + flat_predictions = np.argmax(flat_predictions, axis=1).flatten() + flat_true_labels = [item for sublist in true_labels for item in sublist] + + print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions))) + + print(classification_report(flat_true_labels,flat_predictions)) + + + df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity']) + + df_combined = pd.concat([test_df, df_prediction], axis=1) + + counts = df_combined['prediction_Polarity'].value_counts() + print(counts) + + return 
df_combined
+"""
\ No newline at end of file
diff --git a/api/tokenizer.py b/api/tokenizer.py
new file mode 100644
index 0000000..c546182
--- /dev/null
+++ b/api/tokenizer.py
@@ -0,0 +1,99 @@
+import re
+import warnings
+
+# Regex for GitHub-style @mentions (mirrors USERNAME_REGEX in api/filter.py,
+# defined here as well so tokenize_text does not depend on that module)
+USERNAME_REGEX = r"(\s|^)@(\S*\s?)"
+
+# Dictionary to count token replacements
+counters = {}
+
+
+def replace_token(regex, token_name, text):
+    """
+    Replace matched patterns in the text with the specified token.
+
+    This function uses regular expressions to find occurrences of the pattern
+    and replaces them with a token name. The number of replacements made is counted.
+
+    Args:
+        regex (str): The regular expression pattern to match.
+        token_name (str): The replacement token name.
+        text (str): The input text.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The text with the tokens replacing the matches.
+            - int: The number of replacements made.
+    """
+    replaced_text, replacements = re.subn(regex, f" {token_name} ", text, flags=re.MULTILINE)
+    counters[token_name] = counters.get(token_name, 0) + replacements
+    return replaced_text, replacements
+
+def tokenize_text(text):
+    """
+    Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.
+
+    This function processes the input text and replaces various elements, such as:
+    - Email addresses (replaced with 'MEMAIL').
+    - GitHub mentions (replaced with 'MMENTION').
+    - Code blocks (replaced with 'MICODE').
+    - Version numbers (replaced with 'MVERSIONNUMBER').
+    - Issue mentions (replaced with 'MISSUEMENTION').
+    - URLs (replaced with 'MURL').
+
+    Args:
+        text (str): The input text.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The tokenized text.
+            - int: The total number of replacements made.
+    """
+    total_replacements = 0
+
+    text, replacements = replace_token(r"\S+@\S*\s?", "MEMAIL", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(USERNAME_REGEX, "MMENTION", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"`([^`]*)`", "MICODE", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"\b\d+\.\d+(\.\d+)*\b", "MVERSIONNUMBER", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(r"(\s|^)#\d+", "MISSUEMENTION", text)
+    total_replacements += replacements
+
+    text, replacements = replace_token(
+        r"([a-zA-Z0-9]+):\/\/([\w_-]+(?:\.[\w_-]+)*)[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]",
+        "MURL",
+        text,
+    )
+    total_replacements += replacements
+
+    return text, total_replacements
+
+def transform_text(row):
+    """
+    Transforms a row by cleaning and tokenizing its text content.
+
+    This function extracts the "Text" key from the input dictionary and processes
+    it using the `tokenize_text` function. The text is also cleaned by removing
+    newline characters.
+
+    Args:
+        row (dict): A dictionary containing a 'Text' key.
+
+    Returns:
+        tuple: A tuple containing:
+            - str: The processed text after cleaning and tokenization.
+            - int: The number of replacements made.
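+
+    Example (illustrative sketch; exact whitespace around the inserted tokens may vary):
+        transform_text({"Text": "Ping dev@example.com about #42"})
+        # -> roughly ('Ping MEMAIL about MISSUEMENTION', 2)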
+ """ + text = row.get("Text", "") + + if not isinstance(text, str): + warnings.warn(f"Converting non-string type to string: {type(text)}") + text = str(text) + + text, replaced_count = tokenize_text(text) + text = text.replace("\n", "") + return text, replaced_count \ No newline at end of file diff --git a/env.yml b/env.yml new file mode 100644 index 0000000..97b3bcf --- /dev/null +++ b/env.yml @@ -0,0 +1,21 @@ +name: sentiment_classifier +channels: + - conda-forge + - defaults +dependencies: + - python=3.13.3 + - ipykernel + - pip + - pip: + - bs4 + - torch + - scikit-learn + - seaborn + - tabulate + - markdown + - numpy + - pandas + - scikit-learn + - transformers + - ipywidgets +prefix: /opt/anaconda3/envs/sentiment_classifier diff --git a/exec/train_or_predict.py b/exec/train_or_predict.py new file mode 100644 index 0000000..e266ad7 --- /dev/null +++ b/exec/train_or_predict.py @@ -0,0 +1,45 @@ +import sys +import os +import argparse +import pandas as pd +from transformers import AutoTokenizer, AutoModelForSequenceClassification + +sys.path.append(os.path.join(os.path.dirname(__file__), "..")) +from api.model import predict_model, train_model + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--mode", choices=["train", "predict"], required=True) + parser.add_argument("--input", required=True) + parser.add_argument("--model_path", help="Path to trained model") + parser.add_argument("--output", required=True) + parser.add_argument("--model_name", required=True, help="Hugging Face model name, e.g., 'bert-base-cased'") + return parser.parse_args() + +def main(): + args = parse_args() + df = pd.read_csv(args.input) + + if args.mode == "train": + saved_model_path = train_model( + train_df=df, + model_save_path=args.output, + model_task=AutoModelForSequenceClassification, + model_tokenizer=AutoTokenizer, + model_name=args.model_name + ) + print(f"MODEL_SAVED_AT: {saved_model_path}") + + elif args.mode == "predict": + pred_df = predict_model( + predict_df=df, + model_saved_path=args.model_path, + model_task=AutoModelForSequenceClassification, + model_tokenizer=AutoTokenizer, + model_name=args.model_name + ) + pred_df.to_csv(args.output, index=False) + print(f"PREDICTION_SAVED_AT: {args.output}") + +if __name__ == "__main__": + main() diff --git a/notebooks/Test.ipynb b/notebooks/Test.ipynb new file mode 100644 index 0000000..13e77a4 --- /dev/null +++ b/notebooks/Test.ipynb @@ -0,0 +1,188 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Test\n", + "\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. \n", + "- **Performance breakdown per platform** for each model. 
\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append(os.path.abspath(\"..\"))\n", + "\n", + "from api.model import *" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Test df_crossplatform on 4 models and 3 platforms (Table 3.2)\n", + "In this section, we evaluate the four trained **cross-platform sentiment classification models** on our **cross-platform sentiment dataset**.\n", + "\n", + "## Evaluation Metrics \n", + "We will assess: \n", + "1. **Overall model performance** across all platforms. \n", + "2. **Platform-specific performance** for each model on: \n", + " - **GitHub** \n", + " - **Jira** \n", + " - **Mailbox** \n", + "\n", + "## Results \n", + "The evaluation will print: \n", + "- **Overall accuracy** of each model. \n", + "- **Performance breakdown per platform** for each model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "current_directory = os.getcwd()\n", + "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n", + "\n", + "# Load test dataset\n", + "test_df = pd.read_csv(f'{root}/test_df.csv')\n", + "\n", + "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + "model_results = {}\n", + "\n", + "# Define platform mapping\n", + "platforms = {0: \"GitHub\", 1: \"Jira\", 2: \"Mailbox\"}\n", + "\n", + "# Evaluate each model\n", + "for i, model_name in enumerate(MODEL_NAMES):\n", + " model_path = f\"{root}/{model_name}_model\"\n", + " print(f\"Evaluating {model_name} model...for overall platform\")\n", + "\n", + " # Get overall accuracy\n", + " overall_accuracy = test_model(test_df, model_path, model_select=i)\n", + "\n", + " # Evaluate accuracy per platform\n", + " platform_accuracies = {}\n", + " for platform_id, platform_name in platforms.items():\n", + " test_df_platform = test_df[test_df[\"Platform\"] == platform_id]\n", + " if not test_df_platform.empty:\n", + " print(f\"Evaluating {model_name} model...for {platform_name} platform\")\n", + " accuracy = test_model(test_df_platform, model_path, model_select=i)\n", + " platform_accuracies[platform_name] = accuracy\n", + " else:\n", + " platform_accuracies[platform_name] = \"No data\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Generalization Performance of the Model (Table 3.3)\n", + "In this section, we evaluate the **Bert-CP** model's **generalization performance** on the existing datasets: \n", + "- **GitHub Golden Rule Dataset** \n", + "- **Stack Overflow Dataset** \n", + "\n", + "We will also compare the performance of the **BERT model** trained on **GitHub Golden Rule** and **Stack Overflow** datasets, with a focus on **cross-platform performance**. This comparison aims to validate the **superiority** of our model.\n", + "\n", + "## Evaluation Process \n", + "- **Bert-CP Model Evaluation**: We test the **Bert-CP** model on the **GitHub Golden Rule** and **Stack Overflow** datasets.\n", + "- **Cross-Platform Comparison**: We compare the performance of models trained on **GitHub Golden Rule** and **Stack Overflow** datasets across multiple platforms using the **BERT model**.\n", + "\n", + "## Goals \n", + "- To assess the **generalization** of the **Bert-CP** model across different datasets.\n", + "- To highlight the **superiority** of our cross-platform model over dataset-specific models." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Evaluate how bert trained on different datasets\n", + "\n", + "# Load test datasets\n", + "test_gh = pd.read_csv(f'{root}/test_gh.csv')\n", + "test_so = pd.read_csv(f'{root}/test_so.csv')\n", + "\n", + "# Define model paths\n", + "bert_model_path = f\"{root}/bert_model\"\n", + "gh_bert_model_path = f\"{root}/GH_bert_model\"\n", + "so_bert_model_path = f\"{root}/SO_bert_model\"\n", + "\n", + "# Store results\n", + "model_results = {}\n", + "\n", + "# 1. Validate bert_model on test_gh and test_so\n", + "print(\"Evaluating bert_model on GitHub test dataset...\")\n", + "bert_on_gh = test_model(test_gh, bert_model_path, model_select=0)\n", + "\n", + "print(\"Evaluating bert_model on Stack Overflow test dataset...\")\n", + "bert_on_so = test_model(test_so, bert_model_path, model_select=0)\n", + "\n", + "model_results[\"bert_model\"] = {\n", + " \"test_gh Accuracy\": bert_on_gh,\n", + " \"test_so Accuracy\": bert_on_so\n", + "}\n", + "\n", + "# 2. Validate GH_bert_model on test_so\n", + "print(\"Evaluating GH_bert_model on Stack Overflow test dataset...\")\n", + "gh_bert_on_so = test_model(test_so, gh_bert_model_path, model_select=0)\n", + "\n", + "model_results[\"GH_bert_model\"] = {\n", + " \"test_so Accuracy\": gh_bert_on_so\n", + "}\n", + "\n", + "# 3. Validate SO_bert_model on test_gh\n", + "print(\"Evaluating SO_bert_model on GitHub test dataset...\")\n", + "so_bert_on_gh = test_model(test_gh, so_bert_model_path, model_select=0)\n", + "\n", + "model_results[\"SO_bert_model\"] = {\n", + " \"test_gh Accuracy\": so_bert_on_gh\n", + "}" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sentiment_classifier", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/Train.ipynb b/notebooks/Train.ipynb new file mode 100644 index 0000000..c66c041 --- /dev/null +++ b/notebooks/Train.ipynb @@ -0,0 +1,148 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Train\n", + "\n", + "We will train the following models: \n", + "- **BERT** \n", + "- **XLNet** \n", + "- **RoBERTa** \n", + "- **ALBERT** \n", + "\n", + "## Training Parameters \n", + "- **MAX_LEN**: `256` \n", + "- **BATCH_SIZE**: `16` \n", + "- **LEARNING_RATE**: `2e-5` \n", + "- **EPOCHS**: `4` \n", + "\n", + "Each model is trained using the merged dataset and saved for further evaluation." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "sys.path.append(os.path.abspath(\"..\"))\n", + "import pandas as pd\n", + "\n", + "from api.model import *\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "First we load and split the tokenized datasets into training and testing sets (70/30 split).\n", + "\n", + "### Datasets:\n", + "- **`crossplatform_sf_dataset_tokenized.csv`**: This is the main dataset used in this study.\n", + "- **`so-dataset_tokenized.csv`**: This dataset originates from the research paper *Sentiment Polarity Detection for Software Development*.\n", + "- **`gh-dataset_tokenized.csv`**: This dataset is derived from the research paper *GitHub Golden Rule* (*Can We Use SE-specific Sentiment Analysis Tools in a \n", + "\n", + "### Output:\n", + "- Training: `train_df.csv`, `train_gh.csv`, `train_so.csv`\n", + "- Testing: `test_df.csv`, `test_gh.csv`, `test_so.csv`" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "current_directory = os.getcwd()\n", + "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n", + "\n", + "# Sets up the csv and splits into 70 and 30 split\n", + "# Read datasets\n", + "current_directory = os.getcwd()\n", + "input_path = f'{root}/crossplatform_sf_dataset_tokenized.csv'\n", + "so_input_path = f'{root}/so-dataset_tokenized.csv'\n", + "gh_input_path = f'{root}/gh-dataset_tokenized.csv'\n", + "\n", + "# Load datasets into Pandas DataFrames\n", + "df_crossplatform = pd.read_csv(input_path)\n", + "df_so = pd.read_csv(so_input_path)\n", + "df_gh = pd.read_csv(gh_input_path)\n", + "\n", + "# Split `df_crossplatform` into training (70%) and testing (30%) sets\n", + "train_df, test_df = train_test_split(df_crossplatform, test_size=0.3, random_state=42)\n", + "\n", + "# Split GitHub and Stack Overflow datasets into training and testing sets (70% train, 30% test)\n", + "train_gh, test_gh = train_test_split(df_gh, test_size=0.3, random_state=42)\n", + "train_so, test_so = train_test_split(df_so, test_size=0.3, random_state=42)\n", + "\n", + "# Save all datasets to CSV files for further use\n", + "\n", + "train_df.to_csv(f'{root}/train_df.csv', index=False)\n", + "test_df.to_csv(f'{root}/test_df.csv', index=False)\n", + "train_gh.to_csv(f'{root}/train_gh.csv', index=False)\n", + "train_so.to_csv(f'{root}/train_so.csv', index=False)\n", + "test_gh.to_csv(f'{root}/test_gh.csv', index=False)\n", + "test_so.to_csv(f'{root}/test_so.csv', index=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "These splits are saved for consistent model training and evaluation. `train_df.csv`, `train_gh.csv`, and `train_so.csv` are merged into a dataset and saved for model training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Combine `train_df`, `train_gh`, and `train_so` into the final training dataset\n", + "train_df_final = pd.concat([train_df, train_gh, train_so], axis=0, ignore_index=True)\n", + "train_df_final.to_csv(f'{root}/train_df_final.csv', index=False)\n", + "\n", + "# Define the list of model names to be trained\n", + "MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']\n", + "\n", + "# Train each model and save the trained model files\n", + "for i, model_name in enumerate(MODEL_NAMES):\n", + " model_save_path = f\"{root}/{model_name}_model\"\n", + " print(f\"Training {model_name} model...\")\n", + " train_model(train_df_final, model_save_path, model_select=i)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The models trained are saved to the root directory. We can see them in finder. At this point we can test the models we have created on the datasets, do this in the [Test.ipynb](./Test.ipynb) notebook." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "sentiment_classifier", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/tokenize_statistics.ipynb b/notebooks/tokenize_statistics.ipynb new file mode 100644 index 0000000..9264df8 --- /dev/null +++ b/notebooks/tokenize_statistics.ipynb @@ -0,0 +1,238 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Sentiment Analysis\n", + "\n", + "In this notebook, we test how well four different **machine learning models** can analyze and understand the **sentiment** (positive or negative emotions) expressed in messages on various platforms used by software developers. These platforms include:\n", + "\n", + "- **GitHub** (where developers collaborate on code)\n", + "- **Jira** (used for tracking issues and tasks)\n", + "- **Mailbox** (for email-based communication)\n", + "\n", + "The models we’re testing are:\n", + "- **BERT**\n", + "- **XLNet**\n", + "- **RoBERTa**\n", + "- **ALBERT**\n", + "\n", + "Each model was trained on a mix of data from all three platforms. We then test how well each model performs on new, unseen data from the same platforms.\n", + "\n", + "1. **Overall Accuracy**: How well each model performs across all platforms.\n", + "2. 
**Platform-Specific Accuracy**: How well each model performs on **GitHub**, **Jira**, and **Mailbox** separately.\n",
+    "\n",
+    "The results help us understand which models work best across different communication tools and give insights into how sentiment analysis can be applied to real-world developer conversations.\n",
+    "\n",
+    "Note: this notebook prepares the data, the `Train.ipynb` notebook trains the models, and `Test.ipynb` evaluates the trained models against the datasets.\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "oe8X-6s9btXo"
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "sys.path.append(os.path.abspath(\"..\"))\n",
+    "from tabulate import tabulate\n",
+    "\n",
+    "from api.filter import *\n",
+    "from api.tokenizer import *\n",
+    "from api.model import *"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "RAXtSnSK4LPr"
+   },
+   "source": [
+    "# Tokenized\n",
+    "\n",
+    "This section processes the raw sentiment analysis datasets (`so-dataset.csv`, `gh-dataset.csv`, and `crossplatform_sf_dataset.csv`) by applying a text transformation function. The goal is to standardize and clean the text data before training. You can change the transform_text function to fit specific needs.\n",
+    "\n",
+    "There are additional functions provided in [filter.py](../api/filter.py) and [tokenizer.py](../api/tokenizer.py) that can be used for specific use cases. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "current_directory = os.getcwd()\n",
+    "root = os.path.abspath(os.path.join(current_directory, \"..\", \"..\"))\n",
+    "\n",
+    "# Define input dataset paths\n",
+    "input_paths = [\n",
+    "    f\"{root}/so-dataset.csv\",\n",
+    "    f\"{root}/gh-dataset.csv\",\n",
+    "    f\"{root}/crossplatform_sf_dataset.csv\"\n",
+    "]\n",
+    "\n",
+    "# By default, the transform_text function imported from api.tokenizer is applied.\n",
+    "# Uncomment and adapt the stub below to override it with a custom transformation:\n",
+    "# def transform_text(row):\n",
+    "#     return row[\"Text\"], 1  # e.g. return the original text and a dummy replacement count\n",
+    "\n",
+    "# Loop through each dataset and process it\n",
+    "for input_path in input_paths:\n",
+    "    # Generate output file name\n",
+    "    output_filename = os.path.splitext(os.path.basename(input_path))[0] + \"_tokenized.csv\"\n",
+    "    output_path = os.path.join(os.path.dirname(input_path), output_filename)\n",
+    "\n",
+    "    # Load dataset\n",
+    "    df = pd.read_csv(input_path)\n",
+    "    print(f\"Processing dataset: {input_path}\")\n",
+    "    print(df.head())  # Print first few rows for verification\n",
+    "\n",
+    "    # Apply text transformation\n",
+    "    df[[\"Text\", \"replaced_token\"]] = df.apply(transform_text, axis=1, result_type=\"expand\")\n",
+    "\n",
+    "    # Calculate total replacements from `replaced_token` column\n",
+    "    total_replacements = df[\"replaced_token\"].sum()\n",
+    "\n",
+    "    # Save processed dataset\n",
+    "    df.to_csv(output_path, header=True, index=False)\n",
+    "\n",
+    "    print(f\"Tokenized dataset saved to: {output_path}\\n\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that the data is cleaned and tokenized, let's look at the datasets we have. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "o3Cm2Gn_CbM5"
+   },
+   "source": [
+    "# Dataset Overview\n",
+    "\n",
+    "`so-dataset.csv` : Contains Stack Overflow comment data.\n",
+    "\n",
+    "`gh-dataset.csv` : Contains GitHub comment data (the GitHub Golden Rule dataset). \n",
+    "\n",
+    "`crossplatform_sf_dataset.csv`\n",
+    "\n",
+    "This dataset is designed for **Software Development Sentiment Classification**, containing user comments or discussions from different platforms with sentiment labels.\n",
+    "\n",
+    "## **Column Descriptions**\n",
+    "- **`Text`**: The user comment or discussion content. \n",
+    "- **`Polarity`**: Sentiment label indicating the emotional tendency of the text: \n",
+    "  - `2`: Negative sentiment \n",
+    "  - `0`: Neutral sentiment \n",
+    "  - `1`: Positive sentiment \n",
+    "- **`Platform`**: The source platform of the data, indicating where the comment or discussion originated: \n",
+    "  - `0`: **GitHub** (Discussions related to open-source projects, Issues, Pull Requests) \n",
+    "  - `1`: **Jira** (Bug reports, task comments in software development management tools) \n",
+    "  - `2`: **Mailbox** (Developer communication through emails) \n",
+    "\n",
+    "## **Dataset Distribution**\n",
+    "The dataset consists of data from **GitHub, Jira, and Mailbox**, with different sentiment (`Polarity`) distributions across platforms. It can be used to train and evaluate sentiment classification models to analyze developer emotions on different platforms. \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "QaHSklrRClHU",
+    "outputId": "b3b3c0d8-b59e-41ab-ba80-5569d1d3650e"
+   },
+   "outputs": [],
+   "source": [
+    "\n",
+    "# Load dataset\n",
+    "input_path = f\"{root}/crossplatform_sf_dataset.csv\"\n",
+    "df = pd.read_csv(input_path)\n",
+    "\n",
+    "# Compute dataset statistics\n",
+    "total_samples = len(df)\n",
+    "polarity_counts = df[\"Polarity\"].value_counts().sort_index()\n",
+    "platform_counts = df[\"Platform\"].value_counts().sort_index()\n",
+    "\n",
+    "# Compute Polarity distribution within each Platform\n",
+    "platform_polarity_counts = df.groupby([\"Platform\", \"Polarity\"]).size().unstack().fillna(0)\n",
+    "\n",
+    "# Print results with formatting\n",
+    "print(\"=\" * 50)\n",
+    "print(\"πŸ“Š Dataset Information: crossplatform_sf_dataset.csv\")\n",
+    "print(\"=\" * 50)\n",
+    "print(f\"Total Samples: {total_samples}\\n\")\n",
+    "\n",
+    "# Polarity distribution\n",
+    "print(\"πŸ“Œ Polarity Distribution:\")\n",
+    "print(tabulate(polarity_counts.reset_index(), headers=[\"Polarity\", \"Count\"], tablefmt=\"pretty\"))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# Platform distribution\n",
+    "print(\"πŸ“Œ Platform Distribution:\")\n",
+    "print(tabulate(platform_counts.reset_index(), headers=[\"Platform\", \"Count\"], tablefmt=\"pretty\"))\n",
+    "print(\"\\n\")\n",
+    "\n",
+    "# Platform-wise Polarity distribution\n",
+    "print(\"πŸ“Œ Platform-wise Polarity Distribution:\")\n",
+    "print(tabulate(platform_polarity_counts, headers=\"keys\", tablefmt=\"pretty\"))\n",
+    "print(\"=\" * 50)\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now that we have an understanding of the three datasets we can move over to the [Train.ipynb](./Train.ipynb) Notebook to start training the models based on the datasets we just prepared and reviewed."
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After the models are trained they can be tested with the [Test.ipynb](./Test.ipynb) Notebook. " + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "L4", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "sentiment_classifier", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.13.3" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}
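A minimal sketch of how the new `exec/train_or_predict.py` entry point can be driven end to end. The CSV paths, model file name, and working directory below are placeholders, and the input files are assumed to follow the 'text'/'polarity' layout expected by `api/model.py`; only the flags defined in the script's argparse setup are used.

```python
import subprocess

# Train a classifier on a labelled CSV and save the weights (placeholder paths).
subprocess.run([
    "python", "exec/train_or_predict.py",
    "--mode", "train",
    "--input", "train_df_final.csv",
    "--output", "bert_model.pt",
    "--model_name", "bert-base-cased",
], check=True)

# Score new, unlabelled text with the saved weights; predictions are written to a CSV.
subprocess.run([
    "python", "exec/train_or_predict.py",
    "--mode", "predict",
    "--input", "new_comments.csv",
    "--model_path", "bert_model.pt",
    "--output", "predictions.csv",
    "--model_name", "bert-base-cased",
], check=True)
```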