diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/model.py b/api/model.py
new file mode 100644
index 0000000..ba49c34
--- /dev/null
+++ b/api/model.py
@@ -0,0 +1,1176 @@
+import random
+import numpy as np
+import pandas as pd
+import torch
+import time
+from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import f1_score, accuracy_score, classification_report
+from transformers import (
+    BertTokenizer, BertForSequenceClassification,
+    XLNetTokenizer, XLNetForSequenceClassification,
+    RobertaTokenizer, RobertaForSequenceClassification,
+    AlbertTokenizer, AlbertForSequenceClassification,
+    get_scheduler
+)
+from torch.optim import AdamW
+
+MAX_LEN = 256
+BATCH_SIZE = 16
+LEARNING_RATE = 2e-5
+EPOCHS = 4
+WEIGHT_DECAY = 0.01
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+MODEL_NAMES = ['bert', 'xlnet', 'roberta', 'albert']
+
+# (model class, tokenizer class, pretrained checkpoint) per architecture
+MODELS = [(BertForSequenceClassification, BertTokenizer, 'bert-base-cased'),
+          (XLNetForSequenceClassification, XLNetTokenizer, 'xlnet-base-cased'),
+          (RobertaForSequenceClassification, RobertaTokenizer, 'roberta-base'),
+          (AlbertForSequenceClassification, AlbertTokenizer, 'albert-base-v1')
+         ]
+
+def train_model_bert(train_df, model_save_path):
+
+    """
+    Trains a BERT-based sentiment classification model on the provided dataset.
+
+    Args:
+    - train_df (pd.DataFrame): DataFrame containing training data with 'text' and 'polarity' columns.
+    - model_save_path (str): Path to save the best model.
+
+    Returns:
+    - str: The path where the best model was saved.
+
+    Notes:
+    - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0).
+    - Saves the model with the highest F1 score on the validation set.
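+
+    Example (illustrative; the file name is a placeholder):
+        df = pd.read_csv("train.csv")  # expects 'text' and 'polarity' columns
+        train_model_bert(df, "bert_sentiment.pt")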
+ """ + + seed_torch(42) + + cur_model = MODELS[0] + m_name = MODEL_NAMES[0] + + + train_df['polarity'] = train_df['polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.text.values + labels = train_df.polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE) + + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.to(device) + + + optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) + + + num_training_steps = EPOCHS * len(train_dataloader) + lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps + ) + + + print("Starting training...") + best_f1 = 0 + for epoch in range(EPOCHS): + model.train() + total_loss = 0 + predictions, true_labels = [], [] + + for batch in train_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + optimizer.zero_grad() + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[:2] + loss.backward() + optimizer.step() + lr_scheduler.step() + + total_loss += loss.item() + predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + true_labels.extend(b_labels.cpu().numpy()) + + train_acc = accuracy_score(true_labels, predictions) + print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}") + + + model.eval() + val_predictions, val_labels = [], [] + with torch.no_grad(): + for batch in val_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + outputs = model(b_input_ids, attention_mask=b_input_mask) + logits = outputs[0] + val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + val_labels.extend(b_labels.cpu().numpy()) + + val_acc = accuracy_score(val_labels, val_predictions) + val_f1 = f1_score(val_labels, val_predictions, average='weighted') + print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}") + + + if val_f1 > best_f1: + best_f1 = val_f1 + torch.save(model.state_dict(), model_save_path) + print(f"Best model saved at {model_save_path}") + + + print("Final Model Performance on Validation Set:") + print(classification_report(val_labels, val_predictions, digits=4)) + return 
model_save_path + +def train_model_xlnet(train_df, model_save_path): + + """ + Trains an XLNet-based sentiment classification model on the provided dataset. + + Args: + - train_df (pd.DataFrame): DataFrame containing training data with 'text' and 'polarity' columns. + - model_save_path (str): Path to save the best model. + + Returns: + - str: The path where the best model was saved. + + Notes: + - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0). + - Saves the model with the highest F1 score on the validation set. + """ + + seed_torch(42) + + cur_model = MODELS[1] + m_name = MODEL_NAMES[1] + + + train_df['polarity'] = train_df['polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.text.values + labels = train_df.polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE) + + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.to(device) + + + optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) + + + num_training_steps = EPOCHS * len(train_dataloader) + lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps + ) + + + print("Starting training...") + best_f1 = 0 + for epoch in range(EPOCHS): + model.train() + total_loss = 0 + predictions, true_labels = [], [] + + for batch in train_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + optimizer.zero_grad() + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[:2] + loss.backward() + optimizer.step() + lr_scheduler.step() + + total_loss += loss.item() + predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + true_labels.extend(b_labels.cpu().numpy()) + + train_acc = accuracy_score(true_labels, predictions) + print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}") + + + model.eval() + val_predictions, val_labels = [], [] + with torch.no_grad(): + for batch in val_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + outputs = model(b_input_ids, attention_mask=b_input_mask) + logits = outputs[0] + val_predictions.extend(torch.argmax(logits, 
axis=1).cpu().numpy()) + val_labels.extend(b_labels.cpu().numpy()) + + val_acc = accuracy_score(val_labels, val_predictions) + val_f1 = f1_score(val_labels, val_predictions, average='weighted') + print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}") + + + if val_f1 > best_f1: + best_f1 = val_f1 + torch.save(model.state_dict(), model_save_path) + print(f"Best model saved at {model_save_path}") + + + print("Final Model Performance on Validation Set:") + print(classification_report(val_labels, val_predictions, digits=4)) + return model_save_path + +def train_model_roberta(train_df, model_save_path): + + """ + Trains a Roberta-based sentiment classification model on the provided dataset. + + Args: + - train_df (pd.DataFrame): DataFrame containing training data with 'text' and 'polarity' columns. + - model_save_path (str): Path to save the best model. + + Returns: + - str: The path where the best model was saved. + + Notes: + - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0). + - Saves the model with the highest F1 score on the validation set. + """ + + seed_torch(42) + + cur_model = MODELS[2] + m_name = MODEL_NAMES[2] + + + train_df['polarity'] = train_df['polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.text.values + labels = train_df.polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE) + + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.to(device) + + + optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) + + + num_training_steps = EPOCHS * len(train_dataloader) + lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps + ) + + + print("Starting training...") + best_f1 = 0 + for epoch in range(EPOCHS): + model.train() + total_loss = 0 + predictions, true_labels = [], [] + + for batch in train_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + optimizer.zero_grad() + outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels) + loss, logits = outputs[:2] + loss.backward() + optimizer.step() + lr_scheduler.step() + + total_loss += loss.item() + predictions.extend(torch.argmax(logits, 
axis=1).cpu().numpy()) + true_labels.extend(b_labels.cpu().numpy()) + + train_acc = accuracy_score(true_labels, predictions) + print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}") + + + model.eval() + val_predictions, val_labels = [], [] + with torch.no_grad(): + for batch in val_dataloader: + b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch] + outputs = model(b_input_ids, attention_mask=b_input_mask) + logits = outputs[0] + val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy()) + val_labels.extend(b_labels.cpu().numpy()) + + val_acc = accuracy_score(val_labels, val_predictions) + val_f1 = f1_score(val_labels, val_predictions, average='weighted') + print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}") + + + if val_f1 > best_f1: + best_f1 = val_f1 + torch.save(model.state_dict(), model_save_path) + print(f"Best model saved at {model_save_path}") + + + print("Final Model Performance on Validation Set:") + print(classification_report(val_labels, val_predictions, digits=4)) + return model_save_path + +def train_model_albert(train_df, model_save_path): + + """ + Trains an Albert-based sentiment classification model on the provided dataset. + + Args: + - train_df (pd.DataFrame): DataFrame containing training data with 'text' and 'polarity' columns. + - model_save_path (str): Path to save the best model. + + Returns: + - str: The path where the best model was saved. + + Notes: + - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0). + - Saves the model with the highest F1 score on the validation set. + """ + + seed_torch(42) + + cur_model = MODELS[3] + m_name = MODEL_NAMES[3] + + + train_df['polarity'] = train_df['polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0}) + tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True) + + sentences = train_df.text.values + labels = train_df.polarity.values + + input_ids = [] + attention_masks = [] + + for sent in sentences: + encoded_dict = tokenizer.encode_plus( + str(sent), + add_special_tokens=True, + max_length=MAX_LEN, + padding='max_length', + return_attention_mask=True, + return_tensors='pt', + truncation=True + ) + input_ids.append(encoded_dict['input_ids']) + attention_masks.append(encoded_dict['attention_mask']) + + input_ids = torch.cat(input_ids, dim=0) + attention_masks = torch.cat(attention_masks, dim=0) + labels = torch.tensor(labels) + + print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}') + + + train_inputs, val_inputs, train_labels, val_labels = train_test_split( + input_ids, labels, test_size=0.1, random_state=42) + train_masks, val_masks, _, _ = train_test_split( + attention_masks, labels, test_size=0.1, random_state=42) + + + train_data = TensorDataset(train_inputs, train_masks, train_labels) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE) + + val_data = TensorDataset(val_inputs, val_masks, val_labels) + val_sampler = SequentialSampler(val_data) + val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE) + + + model = cur_model[0].from_pretrained(cur_model[2], num_labels=3) + model.to(device) + + + optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY) + + + num_training_steps = EPOCHS * len(train_dataloader) + lr_scheduler = get_scheduler( + name="linear", optimizer=optimizer, num_warmup_steps=0, 
num_training_steps=num_training_steps
+    )
+
+
+    print("Starting training...")
+    best_f1 = 0
+    for epoch in range(EPOCHS):
+        model.train()
+        total_loss = 0
+        predictions, true_labels = [], []
+
+        for batch in train_dataloader:
+            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+            optimizer.zero_grad()
+            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+            loss, logits = outputs[:2]
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+
+            total_loss += loss.item()
+            predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+            true_labels.extend(b_labels.cpu().numpy())
+
+        train_acc = accuracy_score(true_labels, predictions)
+        print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}")
+
+
+        model.eval()
+        val_predictions, val_labels = [], []
+        with torch.no_grad():
+            for batch in val_dataloader:
+                b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+                outputs = model(b_input_ids, attention_mask=b_input_mask)
+                logits = outputs[0]
+                val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+                val_labels.extend(b_labels.cpu().numpy())
+
+        val_acc = accuracy_score(val_labels, val_predictions)
+        val_f1 = f1_score(val_labels, val_predictions, average='weighted')
+        print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")
+
+
+        if val_f1 > best_f1:
+            best_f1 = val_f1
+            torch.save(model.state_dict(), model_save_path)
+            print(f"Best model saved at {model_save_path}")
+
+
+    print("Final Model Performance on Validation Set:")
+    print(classification_report(val_labels, val_predictions, digits=4))
+    return model_save_path
+
+def train_model(train_df, model_save_path, model_select=0):
+    """
+    Trains a sentiment classification model on the provided dataset.
+
+    Args:
+    - train_df (pd.DataFrame): DataFrame containing training data with 'text' and 'polarity' columns.
+    - model_save_path (str): Path to save the best model.
+    - model_select (int, optional): Index into MODELS/MODEL_NAMES selecting the
+      architecture (0=bert, 1=xlnet, 2=roberta, 3=albert). Defaults to 0.
+
+    Returns:
+        str: The path where the best model was saved.
+
+    Notes:
+    - Converts sentiment labels to numeric form (positive=1, negative=2, neutral=0).
+    - Saves the model with the highest F1 score on the validation set.
+    """
+    seed_torch(42)
+
+    cur_model = MODELS[model_select]
+    m_name = MODEL_NAMES[model_select]
+
+
+    train_df['polarity'] = train_df['polarity'].replace({'positive': 1, 'negative': 2, 'neutral': 0})
+    tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)
+
+    sentences = train_df.text.values
+    labels = train_df.polarity.values
+
+    input_ids = []
+    attention_masks = []
+
+    for sent in sentences:
+        encoded_dict = tokenizer.encode_plus(
+            str(sent),
+            add_special_tokens=True,
+            max_length=MAX_LEN,
+            padding='max_length',
+            return_attention_mask=True,
+            return_tensors='pt',
+            truncation=True
+        )
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    input_ids = torch.cat(input_ids, dim=0)
+    attention_masks = torch.cat(attention_masks, dim=0)
+    labels = torch.tensor(labels)
+
+    print(f'Training data shape: {input_ids.shape}, {attention_masks.shape}, {labels.shape}')
+
+
+    # Both calls use the same random_state, so the mask split stays aligned
+    # with the input_ids split above.
+    train_inputs, val_inputs, train_labels, val_labels = train_test_split(
+        input_ids, labels, test_size=0.1, random_state=42)
+    train_masks, val_masks, _, _ = train_test_split(
+        attention_masks, labels, test_size=0.1, random_state=42)
+
+
+    train_data = TensorDataset(train_inputs, train_masks, train_labels)
+    train_sampler = RandomSampler(train_data)
+    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=BATCH_SIZE)
+
+    val_data = TensorDataset(val_inputs, val_masks, val_labels)
+    val_sampler = SequentialSampler(val_data)
+    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=BATCH_SIZE)
+
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+    model.to(device)
+
+
+    optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
+
+
+    num_training_steps = EPOCHS * len(train_dataloader)
+    lr_scheduler = get_scheduler(
+        name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
+    )
+
+
+    print("Starting training...")
+    best_f1 = 0
+    for epoch in range(EPOCHS):
+        model.train()
+        total_loss = 0
+        predictions, true_labels = [], []
+
+        for batch in train_dataloader:
+            b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+            optimizer.zero_grad()
+            outputs = model(b_input_ids, attention_mask=b_input_mask, labels=b_labels)
+            loss, logits = outputs[:2]
+            loss.backward()
+            optimizer.step()
+            lr_scheduler.step()
+
+            total_loss += loss.item()
+            predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+            true_labels.extend(b_labels.cpu().numpy())
+
+        train_acc = accuracy_score(true_labels, predictions)
+        print(f"Epoch {epoch+1}: Train Loss: {total_loss / len(train_dataloader):.4f}, Accuracy: {train_acc:.4f}")
+
+
+        model.eval()
+        val_predictions, val_labels = [], []
+        with torch.no_grad():
+            for batch in val_dataloader:
+                b_input_ids, b_input_mask, b_labels = [t.to(device) for t in batch]
+                outputs = model(b_input_ids, attention_mask=b_input_mask)
+                logits = outputs[0]
+                val_predictions.extend(torch.argmax(logits, axis=1).cpu().numpy())
+                val_labels.extend(b_labels.cpu().numpy())
+
+        val_acc = accuracy_score(val_labels, val_predictions)
+        val_f1 = f1_score(val_labels, val_predictions, average='weighted')
+        print(f"Validation Accuracy: {val_acc:.4f}, F1 Score: {val_f1:.4f}")
+
+
+        if val_f1 > best_f1:
+            best_f1 = val_f1
+            torch.save(model.state_dict(), model_save_path)
+            print(f"Best model saved at {model_save_path}")
+
+
+    print("Final Model Performance on Validation Set:")
+    # Note: these are last-epoch metrics; the saved checkpoint is from the best-F1 epoch.
+    print(classification_report(val_labels, val_predictions, digits=4))
+    return model_save_path
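+
+# Illustrative usage of train_model (file names are placeholders; the CSV must
+# provide 'text' and 'polarity' columns). model_select indexes MODELS:
+# 0=bert, 1=xlnet, 2=roberta, 3=albert.
+#
+#   df = pd.read_csv("train.csv")
+#   train_model(df, "roberta_sentiment.pt", model_select=2)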
+
+def seed_torch(seed):
+    """
+    Set random seeds for reproducibility in PyTorch and related libraries.
+
+    Args:
+    - seed (int): number to use for all random generators.
+
+    Example:
+        seed_torch(42)
+    """
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+
+def test_model(test_df, model_saved_path, model_select=0):
+    """
+    Tests a fine-tuned sentiment classification model on a test dataset and evaluates its performance.
+
+    Args:
+    - test_df (pd.DataFrame): DataFrame containing 'text' and 'polarity' columns.
+    - model_saved_path (str): Path to the saved model state dict.
+    - model_select (int, optional): Index into MODELS/MODEL_NAMES selecting the
+      architecture (0=bert, 1=xlnet, 2=roberta, 3=albert). Defaults to 0.
+
+    Returns:
+        pd.DataFrame: A DataFrame with the original test data and the model's predictions.
+    """
+    seed_torch(42)
+
+    cur_model = MODELS[model_select]
+    m_name = MODEL_NAMES[model_select]
+
+    tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)
+
+    begin = time.time()
+
+    test_df['polarity'] = test_df['polarity'].replace({
+        'positive': 1,
+        'negative': 2,
+        'neutral': 0})
+
+
+    sentences = test_df.text.values
+    labels = test_df.polarity.values
+
+    input_ids = []
+    attention_masks = []
+
+    for sent in sentences:
+        encoded_dict = tokenizer.encode_plus(
+            str(sent),
+            add_special_tokens=True,
+            max_length=MAX_LEN,
+            padding='max_length',
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt',
+        )
+
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    prediction_inputs = torch.cat(input_ids, dim=0)
+    prediction_masks = torch.cat(attention_masks, dim=0)
+    prediction_labels = torch.tensor(labels)
+
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+    # map_location lets a GPU-trained checkpoint load on CPU-only machines;
+    # the model itself must also be moved to the active device.
+    model.load_state_dict(torch.load(model_saved_path, map_location=device))
+    model.to(device)
+    model.eval()
+
+    predictions, true_labels = [], []
+
+    for batch in prediction_dataloader:
+        batch = tuple(t.to(device) for t in batch)
+        b_input_ids, b_input_mask, b_labels = batch
+
+        with torch.no_grad():
+            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
+            logits = outputs[0]
+
+        logits = logits.detach().cpu().numpy()
+        label_ids = b_labels.to('cpu').numpy()
+
+        predictions.append(logits)
+        true_labels.append(label_ids)
+
+    end = time.time()
+    print('Prediction used {:.2f} seconds'.format(end - begin))
+
+    flat_predictions = [item for sublist in predictions for item in sublist]
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+    flat_true_labels = [item for sublist in true_labels for item in sublist]
+
+    print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels, flat_predictions)))
+
+    print(classification_report(flat_true_labels, flat_predictions))
+
+
+    df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_polarity'])
+
+    df_combined = pd.concat([test_df, df_prediction], axis=1)
+
+    counts = df_combined['prediction_polarity'].value_counts()
+    print(counts)
+
+    return df_combined
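+
+# Illustrative usage of test_model (file names are placeholders): evaluate a
+# fine-tuned BERT checkpoint on a labeled CSV with 'text' and 'polarity' columns.
+#
+#   results = test_model(pd.read_csv("test.csv"), "bert_sentiment.pt", model_select=0)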
+
+def _predict_polarity(test_df, model_saved_path, model_select):
+    """
+    Shared inference routine used by the predict_* wrappers below.
+
+    Tokenizes the text column, runs the selected fine-tuned model, and
+    overwrites the polarity column with the predicted class ids. The incoming
+    polarity column (string or numeric) is only used to build the
+    TensorDataset; it is replaced with predictions before returning.
+    """
+    seed_torch(42)
+
+    cur_model = MODELS[model_select]
+
+    tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)
+
+    begin = time.time()
+
+    # Convert string labels to numbers only if needed
+    test_df['polarity'] = test_df['polarity'].replace({
+        'positive': 1,
+        'negative': 2,
+        'neutral': 0
+    })
+
+    sentences = test_df.text.values
+    labels = test_df.polarity.values
+
+    input_ids = []
+    attention_masks = []
+
+    for sent in sentences:
+        encoded_dict = tokenizer.encode_plus(
+            str(sent),
+            add_special_tokens=True,
+            max_length=MAX_LEN,
+            padding="max_length",
+            truncation=True,
+            return_attention_mask=True,
+            return_tensors='pt',
+        )
+        input_ids.append(encoded_dict['input_ids'])
+        attention_masks.append(encoded_dict['attention_mask'])
+
+    prediction_inputs = torch.cat(input_ids, dim=0)
+    prediction_masks = torch.cat(attention_masks, dim=0)
+    prediction_labels = torch.tensor(labels)
+
+    prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
+    prediction_sampler = SequentialSampler(prediction_data)
+    prediction_dataloader = DataLoader(
+        prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE
+    )
+
+    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
+    # map_location lets a GPU-trained checkpoint load on CPU-only machines;
+    # the model itself must also be moved to the active device.
+    model.load_state_dict(torch.load(model_saved_path, map_location=device))
+    model.to(device)
+    model.eval()
+
+    predictions = []
+
+    for batch in prediction_dataloader:
+        batch = tuple(t.to(device) for t in batch)
+        b_input_ids, b_input_mask, _ = batch
+
+        with torch.no_grad():
+            outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
+            logits = outputs[0]
+
+        logits = logits.detach().cpu().numpy()
+        predictions.append(logits)
+
+    end = time.time()
+    print('Prediction used {:.2f} seconds'.format(end - begin))
+
+    flat_predictions = [item for sublist in predictions for item in sublist]
+    flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
+
+    # Overwrite polarity column with predicted values
+    test_df['polarity'] = flat_predictions
+
+    return test_df
+
+def predict_bert(test_df, model_saved_path):
+    """
+    Runs inference with a fine-tuned BERT sentiment classification model and updates the
+    polarity column in the test dataset with predicted values.
+
+    Args:
+    - test_df (pd.DataFrame): DataFrame containing text and original polarity labels.
+    - model_saved_path (str): Path to the saved BERT model.
+
+    Returns:
+        pd.DataFrame: The same DataFrame with polarity replaced by BERT predictions.
+    """
+    return _predict_polarity(test_df, model_saved_path, model_select=0)
+
+def predict_xlnet(test_df, model_saved_path):
+    """
+    Runs inference with a fine-tuned XLNet sentiment classification model and updates the
+    polarity column in the test dataset with predicted values.
+
+    Args:
+    - test_df (pd.DataFrame): DataFrame containing text and original polarity labels.
+    - model_saved_path (str): Path to the saved XLNet model.
+
+    Returns:
+        pd.DataFrame: The same DataFrame with polarity replaced by XLNet predictions.
+    """
+    return _predict_polarity(test_df, model_saved_path, model_select=1)
+
+def predict_roberta(test_df, model_saved_path):
+    """
+    Runs inference with a fine-tuned RoBERTa sentiment classification model and updates the
+    polarity column in the test dataset with predicted values.
+
+    Args:
+    - test_df (pd.DataFrame): DataFrame containing text and original polarity labels.
+    - model_saved_path (str): Path to the saved RoBERTa model.
+
+    Returns:
+        pd.DataFrame: The same DataFrame with polarity replaced by RoBERTa predictions.
+    """
+    return _predict_polarity(test_df, model_saved_path, model_select=2)
+
+def predict_albert(test_df, model_saved_path):
+    """
+    Runs inference with a fine-tuned ALBERT sentiment classification model and updates the
+    polarity column in the test dataset with predicted values.
+
+    Args:
+    - test_df (pd.DataFrame): DataFrame containing text and original polarity labels.
+    - model_saved_path (str): Path to the saved ALBERT model.
+
+    Returns:
+        pd.DataFrame: The same DataFrame with polarity replaced by ALBERT predictions.
+    """
+    return _predict_polarity(test_df, model_saved_path, model_select=3)
parser.add_argument("--output", required=True, help="Where to save output") + + return parser.parse_args() + +def main(): + args = parse_args() + + # Validate model name early + if args.mode == "train" and args.model not in TRAINERS: + raise ValueError(f"Invalid model '{args.model}'. Choose from: {list(TRAINERS.keys())}") + + if args.mode == "predict" and args.model not in PREDICTORS: + raise ValueError(f"Invalid model '{args.model}'. Choose from: {list(PREDICTORS.keys())}") + + df = pd.read_csv(args.input) + + if args.mode == "train": + trainer = TRAINERS[args.model] + + result = trainer(df, args.output) + print(f"MODEL_SAVED_AT: {result}") + + elif args.mode == "predict": + predictor = PREDICTORS[args.model] + + model_file = args.model_path + print(f"Using model file: {model_file}") + + pred_df = predictor(df, model_file) + pred_df.to_csv(args.output, index=False) + print(f"PREDICTION_SAVED_AT: {args.output}") + +if __name__ == "__main__": + main()