64 changes: 64 additions & 0 deletions api/filter.py
Note to self: The functions in this filter.py are not used anywhere in the original notebook, and therefore they are not used in these 3 refactored notebooks.

@@ -0,0 +1,64 @@
import sys
import re
from unicodedata import category
from bs4 import BeautifulSoup
from markdown import markdown


USERNAME_REGEX = r"(\s|^)@(\S*\s?)"

# Set of all Unicode punctuation (P*) and symbol (S*) characters
punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith(("P", "S"))}

# Dictionary to count token replacements
counters = {}

def remove_punctuation(text):
"""
Remove all punctuation characters from the given text.

Args:
text (str): The input text.

Returns:
str: The text without any punctuation.
"""
return "".join(char for char in text if char not in punctuation)

def clean_text(text):
"""
Remove quoted text and large code blocks from GitHub issues or comments.

This function performs the following clean-up:
- Removes quoted email/notification text from GitHub.
- Removes code blocks enclosed in triple backticks.

Args:
text (str): The input text (typically from a GitHub issue or comment).

Returns:
str: The cleaned text without quoted text or code blocks.
"""
# Remove quoted text from emails/notifications
text = re.sub(r"^(On[\s\S]*?notifications@github\.com\s*?wrote:\s*?)?(^(\>).*\s)*", '', text, flags=re.MULTILINE)

# Remove code blocks enclosed in triple backticks
text = re.sub(r"```[a-z]*\n[\s\S]*?\n```", "", text)

return text

def remove_markdown_content(text):
"""
Converts Markdown content to plain text by removing all Markdown formatting.

This function processes the input Markdown text and converts it to plain text
by removing all Markdown syntax.

Args:
text (str): The input Markdown text.

Returns:
str: Cleaned text without Markdown formatting.
"""
html = markdown(text)
return "".join(BeautifulSoup(html, "lxml").findAll(text=True))
121 changes: 121 additions & 0 deletions api/test.py
@@ -0,0 +1,121 @@
import os
import re
import string
import random
import warnings
import argparse
import numpy as np
import pandas as pd
import torch
import time
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from io import StringIO
from unicodedata import category
from bs4 import BeautifulSoup
from markdown import markdown
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from torch.utils.data import DataLoader, RandomSampler, Dataset
from transformers import (
BertTokenizer, BertForSequenceClassification, BertForMaskedLM,
XLNetTokenizer, XLNetForSequenceClassification,
RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,
AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,
get_scheduler
)
from torch.optim import AdamW
from api.train import *

def test_model(test_df, model_saved_path, model_select=0):

MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
(XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),
(RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),
(AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')
]
MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']
seed_torch(42)

cur_model=MODELS[model_select]
m_name=MODEL_NAMES[model_select]

tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)

begin=time.time()

test_df['Polarity']=test_df['Polarity'].replace({
'positive':1,
'negative':2,
'neutral':0})


sentences = test_df.Text.values
labels = test_df.Polarity.values

input_ids = []
attention_masks = []

for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            str(sent),
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',        # replaces the deprecated pad_to_max_length=True
            truncation=True,             # truncate inputs longer than MAX_LEN
            return_attention_mask=True,
            return_tensors='pt',
        )

input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])

prediction_inputs = torch.cat(input_ids,dim=0)
prediction_masks = torch.cat(attention_masks,dim=0)
prediction_labels = torch.tensor(labels)

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)

    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
    model.load_state_dict(torch.load(model_saved_path, map_location=device))
    model.to(device)  # keep the model on the same device as the batches below
    model.eval()

predictions,true_labels=[],[]

for batch in prediction_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch

with torch.no_grad():
outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
logits = outputs[0]

logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()

predictions.append(logits)
true_labels.append(label_ids)

end=time.time()
print('Prediction used {:.2f} seconds'.format(end - begin))

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))

print(classification_report(flat_true_labels,flat_predictions))


df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])

df_combined = pd.concat([test_df, df_prediction], axis=1)

counts = df_combined['prediction_Polarity'].value_counts()
print(counts)

return df_combined
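For reference, a minimal invocation sketch; it assumes api.train provides MAX_LEN, BATCH_SIZE, device, and seed_torch as used above, and the checkpoint path below is hypothetical:

import pandas as pd
from api.test import test_model

# Illustrative test set with the 'Text' and 'Polarity' columns test_model expects
test_df = pd.DataFrame({
    "Text": ["This release works great", "The install keeps failing"],
    "Polarity": ["positive", "negative"],
})

# model_select=0 selects BERT; the saved-weights path is hypothetical
results = test_model(test_df, "models/bert_polarity.pt", model_select=0)
print(results[["Text", "Polarity", "prediction_Polarity"]])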
99 changes: 99 additions & 0 deletions api/tokenizer.py
Note to self: The functions in this tokenizer.py are not used anywhere in the original notebook, and therefore they are not used in these 3 refactored notebooks.

@@ -0,0 +1,99 @@
import re
import warnings

# Regex for GitHub @username mentions (same pattern as defined in filter.py)
USERNAME_REGEX = r"(\s|^)@(\S*\s?)"

# Dictionary to count token replacements
counters = {}


def replace_token(regex, token_name, text):
"""
Replace matched patterns in the text with the specified token.

This function uses regular expressions to find occurrences of the pattern
and replaces them with a token name. The number of replacements made is counted.

Args:
regex (str): The regular expression pattern to match.
token_name (str): The replacement token name.
text (str): The input text.

Returns:
tuple: A tuple containing:
- str: The text with the tokens replacing the matches.
- int: The number of replacements made.
"""
replaced_text, replacements = re.subn(regex, f" {token_name} ", text, flags=re.MULTILINE)
counters[token_name] = counters.get(token_name, 0) + replacements
return replaced_text, replacements

def tokenize_text(text):
"""
Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.

This function processes the input text and replaces various elements, such as:
- Email addresses (replaced with 'MEMAIL').
- GitHub mentions (replaced with 'MMENTION').
- Code blocks (replaced with 'MICODE').
- Version numbers (replaced with 'MVERSIONNUMBER').
- Issue mentions (replaced with 'MISSUEMENTION').
- URLs (replaced with 'MURL').

Args:
text (str): The input text.

Returns:
tuple: A tuple containing:
- str: The tokenized text.
- int: The total number of replacements made.
"""
total_replacements = 0

text, replacements = replace_token(r"\S+@\S*\s?", "MEMAIL", text)
total_replacements += replacements

text, replacements = replace_token(USERNAME_REGEX, "MMENTION", text)
total_replacements += replacements

text, replacements = replace_token(r"`([^`]*)`", "MICODE", text)
total_replacements += replacements

text, replacements = replace_token(r"\b\d+\.\d+(\.\d+)*\b", "MVERSIONNUMBER", text)
total_replacements += replacements

text, replacements = replace_token(r"(\s|^)#\d+", "MISSUEMENTION", text)
total_replacements += replacements

text, replacements = replace_token(
r"([a-zA-Z0-9]+):\/\/([\w_-]+(?:\.[\w_-]+)*)[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]",
"MURL",
text,
)
total_replacements += replacements

return text, total_replacements

def transform_text(row):
"""
Transforms a row by cleaning and tokenizing its text content.

This function extracts the "Text" key from the input dictionary and processes
it using the `tokenize_text` function. The text is also cleaned by removing
newline characters.

Args:
row (dict): A dictionary containing a 'Text' key.

Returns:
tuple: A tuple containing:
- str: The processed text after cleaning and tokenization.
- int: The number of replacements made.
"""
text = row.get("Text", "")

if not isinstance(text, str):
warnings.warn(f"Converting non-string type to string: {type(text)}")
text = str(text)

text, replaced_count = tokenize_text(text)
text = text.replace("\n", "")
return text, replaced_count
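A small usage sketch for the row-level helper; the sample row is made up, and with pandas the same call can be mapped over a DataFrame row by row:

row = {"Text": "@octocat see #42, repro at https://example.com, needs v1.2.3 of `that lib`"}
tokenized, n_replaced = transform_text(row)
print(tokenized)    # mention, issue number, URL, version and inline code become M* tokens
print(n_replaced)   # total number of substitutions made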