64 changes: 64 additions & 0 deletions api/filter.py
Note to self: The functions in this filter.py are not used anywhere in the original notebook, and therefore they are not used in these 3 refactored notebooks.

@@ -0,0 +1,64 @@
import sys
import re
from unicodedata import category
from bs4 import BeautifulSoup
from markdown import markdown


USERNAME_REGEX = r"(\s|^)@(\S*\s?)"

# Set of all Unicode punctuation (P*) and symbol (S*) characters
punctuation = {chr(i) for i in range(sys.maxunicode + 1) if category(chr(i)).startswith(("P", "S"))}

# Dictionary to count token replacements
counters = {}

def remove_punctuation(text):
"""
Remove all punctuation characters from the given text.

Args:
text (str): The input text.

Returns:
str: The text without any punctuation.
"""
return "".join(char for char in text if char not in punctuation)

def clean_text(text):
"""
Remove quoted text and large code blocks from GitHub issues or comments.

This function performs the following clean-up:
- Removes quoted email/notification text from GitHub.
- Removes code blocks enclosed in triple backticks.

Args:
text (str): The input text (typically from a GitHub issue or comment).

Returns:
str: The cleaned text without quoted text or code blocks.
"""
# Remove quoted text from emails/notifications
text = re.sub(r"^(On[\s\S]*?notifications@github\.com\s*?wrote:\s*?)?(^(\>).*\s)*", '', text, flags=re.MULTILINE)

# Remove code blocks enclosed in triple backticks
text = re.sub(r"```[a-z]*\n[\s\S]*?\n```", "", text)

return text

def remove_markdown_content(text):
"""
Converts Markdown content to plain text by removing all Markdown formatting.

This function processes the input Markdown text and converts it to plain text
by removing all Markdown syntax.

Args:
text (str): The input Markdown text.

Returns:
str: Cleaned text without Markdown formatting.
"""
html = markdown(text)
return "".join(BeautifulSoup(html, "lxml").findAll(text=True))
121 changes: 121 additions & 0 deletions api/test.py
@@ -0,0 +1,121 @@
import os
import re
import string
import random
import warnings
import argparse
import numpy as np
import pandas as pd
import torch
import time
import seaborn as sns
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from io import StringIO
from unicodedata import category
from bs4 import BeautifulSoup
from markdown import markdown
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, classification_report
from torch.utils.data import DataLoader, RandomSampler, Dataset
from transformers import (
BertTokenizer, BertForSequenceClassification, BertForMaskedLM,
XLNetTokenizer, XLNetForSequenceClassification,
RobertaTokenizer, RobertaForSequenceClassification, RobertaForMaskedLM,
AlbertTokenizer, AlbertForSequenceClassification, AlbertForMaskedLM,
get_scheduler
)
from torch.optim import AdamW
from api.train import *

def test_model(test_df, model_saved_path, model_select=0):

MODELS = [(BertForSequenceClassification,BertTokenizer,'bert-base-cased'),
(XLNetForSequenceClassification, XLNetTokenizer,'xlnet-base-cased'),
(RobertaForSequenceClassification, RobertaTokenizer,'roberta-base'),
(AlbertForSequenceClassification, AlbertTokenizer,'albert-base-v1')
]
MODEL_NAMES = ['bert', 'xlnet', 'Roberta', 'albert']
seed_torch(42)

cur_model=MODELS[model_select]
m_name=MODEL_NAMES[model_select]

tokenizer = cur_model[1].from_pretrained(cur_model[2], do_lower_case=True)

begin=time.time()

test_df['Polarity']=test_df['Polarity'].replace({
'positive':1,
'negative':2,
'neutral':0})


sentences = test_df.Text.values
labels = test_df.Polarity.values

input_ids = []
attention_masks = []

for sent in sentences:
        encoded_dict = tokenizer.encode_plus(
            str(sent),
            add_special_tokens=True,
            max_length=MAX_LEN,
            padding='max_length',        # replaces the deprecated pad_to_max_length=True
            truncation=True,             # truncate inputs longer than MAX_LEN
            return_attention_mask=True,
            return_tensors='pt',
        )

input_ids.append(encoded_dict['input_ids'])
attention_masks.append(encoded_dict['attention_mask'])

prediction_inputs = torch.cat(input_ids,dim=0)
prediction_masks = torch.cat(attention_masks,dim=0)
prediction_labels = torch.tensor(labels)

prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=BATCH_SIZE)

    model = cur_model[0].from_pretrained(cur_model[2], num_labels=3)
    model.load_state_dict(torch.load(model_saved_path, map_location=device))
    model.to(device)  # keep the model on the same device as the batches below
    model.eval()

predictions,true_labels=[],[]

for batch in prediction_dataloader:
batch = tuple(t.to(device) for t in batch)
b_input_ids, b_input_mask, b_labels = batch

with torch.no_grad():
outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
logits = outputs[0]

logits = logits.detach().cpu().numpy()
label_ids = b_labels.to('cpu').numpy()

predictions.append(logits)
true_labels.append(label_ids)

end=time.time()
print('Prediction used {:.2f} seconds'.format(end - begin))

flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
flat_true_labels = [item for sublist in true_labels for item in sublist]

print("Accuracy of {} is: {}".format(m_name, accuracy_score(flat_true_labels,flat_predictions)))

print(classification_report(flat_true_labels,flat_predictions))


df_prediction = pd.DataFrame(flat_predictions, columns=['prediction_Polarity'])

df_combined = pd.concat([test_df, df_prediction], axis=1)

counts = df_combined['prediction_Polarity'].value_counts()
print(counts)

return df_combined
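For reference, a minimal invocation sketch; it assumes api.train provides MAX_LEN, BATCH_SIZE, device, and seed_torch as used above, and the checkpoint path below is hypothetical:

import pandas as pd
from api.test import test_model

# Illustrative test set with the 'Text' and 'Polarity' columns test_model expects
test_df = pd.DataFrame({
    "Text": ["This release works great", "The install keeps failing"],
    "Polarity": ["positive", "negative"],
})

# model_select=0 selects BERT; the saved-weights path is hypothetical
results = test_model(test_df, "models/bert_polarity.pt", model_select=0)
print(results[["Text", "Polarity", "prediction_Polarity"]])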
99 changes: 99 additions & 0 deletions api/tokenizer.py
Note to self: The functions in this tokenizer.py are not used anywhere in the original notebook, and therefore they are not used in these 3 refactored notebooks.

@@ -0,0 +1,99 @@
import re
import warnings

# Regex for GitHub @username mentions (same pattern as defined in filter.py)
USERNAME_REGEX = r"(\s|^)@(\S*\s?)"

# Dictionary to count token replacements
counters = {}


def replace_token(regex, token_name, text):
"""
Replace matched patterns in the text with the specified token.

This function uses regular expressions to find occurrences of the pattern
and replaces them with a token name. The number of replacements made is counted.

Args:
regex (str): The regular expression pattern to match.
token_name (str): The replacement token name.
text (str): The input text.

Returns:
tuple: A tuple containing:
- str: The text with the tokens replacing the matches.
- int: The number of replacements made.
"""
replaced_text, replacements = re.subn(regex, f" {token_name} ", text, flags=re.MULTILINE)
counters[token_name] = counters.get(token_name, 0) + replacements
return replaced_text, replacements

def tokenize_text(text):
"""
Tokenizes a given text by replacing specific elements such as emails, mentions, URLs, etc.

This function processes the input text and replaces various elements, such as:
- Email addresses (replaced with 'MEMAIL').
- GitHub mentions (replaced with 'MMENTION').
- Code blocks (replaced with 'MICODE').
- Version numbers (replaced with 'MVERSIONNUMBER').
- Issue mentions (replaced with 'MISSUEMENTION').
- URLs (replaced with 'MURL').

Args:
text (str): The input text.

Returns:
tuple: A tuple containing:
- str: The tokenized text.
- int: The total number of replacements made.
"""
total_replacements = 0

text, replacements = replace_token(r"\S+@\S*\s?", "MEMAIL", text)
total_replacements += replacements

text, replacements = replace_token(USERNAME_REGEX, "MMENTION", text)
total_replacements += replacements

text, replacements = replace_token(r"`([^`]*)`", "MICODE", text)
total_replacements += replacements

text, replacements = replace_token(r"\b\d+\.\d+(\.\d+)*\b", "MVERSIONNUMBER", text)
total_replacements += replacements

text, replacements = replace_token(r"(\s|^)#\d+", "MISSUEMENTION", text)
total_replacements += replacements

text, replacements = replace_token(
r"([a-zA-Z0-9]+):\/\/([\w_-]+(?:\.[\w_-]+)*)[\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-]",
"MURL",
text,
)
total_replacements += replacements

return text, total_replacements

def transform_text(row):
"""
Transforms a row by cleaning and tokenizing its text content.

This function extracts the "Text" key from the input dictionary and processes
it using the `tokenize_text` function. The text is also cleaned by removing
newline characters.

Args:
row (dict): A dictionary containing a 'Text' key.

Returns:
tuple: A tuple containing:
- str: The processed text after cleaning and tokenization.
- int: The number of replacements made.
"""
text = row.get("Text", "")

if not isinstance(text, str):
warnings.warn(f"Converting non-string type to string: {type(text)}")
text = str(text)

text, replaced_count = tokenize_text(text)
text = text.replace("\n", "")
return text, replaced_count
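A small usage sketch for the row-level helper; the sample row is made up, and with pandas the same call can be mapped over a DataFrame row by row:

row = {"Text": "@octocat see #42, repro at https://example.com, needs v1.2.3 of `that lib`"}
tokenized, n_replaced = transform_text(row)
print(tokenized)    # mention, issue number, URL, version and inline code become M* tokens
print(n_replaced)   # total number of substitutions made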