Skip to content
This repository was archived by the owner on Sep 29, 2025. It is now read-only.

Commit 2d4e0d0

Browse files
committed
Finish, fixing problem (ref, positioning, mistakes, etc.)
1 parent 9fc6ac6 commit 2d4e0d0

File tree

11 files changed

+411
-50
lines changed

11 files changed

+411
-50
lines changed

bib/output.aux

Lines changed: 250 additions & 0 deletions
Large diffs are not rendered by default.

bib/references.bib

Lines changed: 83 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,87 @@
11

2+
@article{buda_systematic_2018,
3+
title = {A systematic study of the class imbalance problem in convolutional neural networks},
4+
volume = {106},
5+
issn = {0893-6080},
6+
url = {https://www.sciencedirect.com/science/article/pii/S0893608018302107},
7+
doi = {10.1016/j.neunet.2018.07.011},
8+
abstract = {In this study, we systematically investigate the impact of class imbalance on classification performance of convolutional neural networks ({CNNs}) and compare frequently used methods to address the issue. Class imbalance is a common problem that has been comprehensively studied in classical machine learning, yet very limited systematic research is available in the context of deep learning. In our study, we use three benchmark datasets of increasing complexity, {MNIST}, {CIFAR}-10 and {ImageNet}, to investigate the effects of imbalance on classification and perform an extensive comparison of several methods to address the issue: oversampling, undersampling, two-phase training, and thresholding that compensates for prior class probabilities. Our main evaluation metric is area under the receiver operating characteristic curve ({ROC} {AUC}) adjusted to multi-class tasks since overall accuracy metric is associated with notable difficulties in the context of imbalanced data. Based on results from our experiments we conclude that (i) the effect of class imbalance on classification performance is detrimental; (ii) the method of addressing class imbalance that emerged as dominant in almost all analyzed scenarios was oversampling; (iii) oversampling should be applied to the level that completely eliminates the imbalance, whereas the optimal undersampling ratio depends on the extent of imbalance; (iv) as opposed to some classical machine learning models, oversampling does not cause overfitting of {CNNs}; (v) thresholding should be applied to compensate for prior class probabilities when overall number of properly classified cases is of interest.},
9+
pages = {249--259},
10+
journaltitle = {Neural Networks},
11+
shortjournal = {Neural Networks},
12+
author = {Buda, Mateusz and Maki, Atsuto and Mazurowski, Maciej A.},
13+
urldate = {2024-02-13},
14+
date = {2018-10-01},
15+
keywords = {Class imbalance, Convolutional neural networks, Deep learning, Image classification},
16+
}
17+
18+
@article{beyan_classifying_2015,
19+
title = {Classifying imbalanced data sets using similarity based hierarchical decomposition},
20+
volume = {48},
21+
issn = {0031-3203},
22+
url = {https://www.sciencedirect.com/science/article/pii/S003132031400449X},
23+
doi = {10.1016/j.patcog.2014.10.032},
24+
abstract = {Classification of data is difficult if the data is imbalanced and classes are overlapping. In recent years, more research has started to focus on classification of imbalanced data since real world data is often skewed. Traditional methods are more successful with classifying the class that has the most samples (majority class) compared to the other classes (minority classes). For the classification of imbalanced data sets, different methods are available, although each has some advantages and shortcomings. In this study, we propose a new hierarchical decomposition method for imbalanced data sets which is different from previously proposed solutions to the class imbalance problem. Additionally, it does not require any data pre-processing step as many other solutions need. The new method is based on clustering and outlier detection. The hierarchy is constructed using the similarity of labeled data subsets at each level of the hierarchy with different levels being built by different data and feature subsets. Clustering is used to partition the data while outlier detection is utilized to detect minority class samples. The comparison of the proposed method with state of art the methods using 20 public imbalanced data sets and 181 synthetic data sets showed that the proposed method׳s classification performance is better than the state of art methods. It is especially successful if the minority class is sparser than the majority class. It has accurate performance even when classes have sub-varieties and minority and majority classes are overlapping. Moreover, its performance is also good when the class imbalance ratio is low, i.e. classes are more imbalanced.},
25+
pages = {1653--1672},
26+
number = {5},
27+
journaltitle = {Pattern Recognition},
28+
shortjournal = {Pattern Recognition},
29+
author = {Beyan, Cigdem and Fisher, Robert},
30+
urldate = {2024-02-13},
31+
date = {2015-05-01},
32+
keywords = {Class imbalance problem, Clustering, Hierarchical decomposition, Minority–majority classes, Outlier detection},
33+
}
34+
35+
@article{haixiang_bpso-adaboost-knn_2016,
36+
title = {{BPSO}-Adaboost-{KNN} ensemble learning algorithm for multi-class imbalanced data classification},
37+
volume = {49},
38+
issn = {0952-1976},
39+
url = {https://www.sciencedirect.com/science/article/pii/S0952197615002110},
40+
doi = {10.1016/j.engappai.2015.09.011},
41+
abstract = {This paper proposes an ensemble algorithm named of {BPSO}-Adaboost-{KNN} to cope with multi-class imbalanced data classification. The main idea of this algorithm is to integrate feature selection and boosting into ensemble. What’s more, we utilize a novel evaluation metric called {AUCarea} which is especially for multi-class classification. In our model {BPSO} is employed as the feature selection algorithm in which {AUCarea} is chosen as the fitness. For classification, we generate a boosting classifier in which {KNN} is selected as the basic classifier. In order to verify the effectiveness of our method, 19 benchmarks are used in our experiments. The results show that the proposed algorithm improves both the stability and the accuracy of boosting after carrying out feature selection, and the performance of our algorithm is comparable with other state-of-the-art algorithms. In statistical analyses, we apply Bland–Altman analysis to show the consistencies between {AUCarea} and other popular metrics like average G-mean, average F-value etc. Besides, we use linear regression to find deeper correlation between {AUCarea} and other metrics in order to show why {AUCarea} works well in this issue. We also put out a series of statistical studies in order to analyze if there exist significant improvements after feature selection and boosting are employed. At last, the proposed algorithm is applied in oil-bearing of reservoir recognition. The classification precision is up to 99\% in oilsk81-oilsk85 well logging data in Jianghan oilfield of China, which is 20\% higher than {KNN} classifier. Particularly, the proposed algorithm has significant superiority when distinguishing the oil layer from other layers.},
42+
pages = {176--193},
43+
journaltitle = {Engineering Applications of Artificial Intelligence},
44+
shortjournal = {Engineering Applications of Artificial Intelligence},
45+
author = {Haixiang, Guo and Yijing, Li and Yanan, Li and Xiao, Liu and Jinling, Li},
46+
urldate = {2024-02-13},
47+
date = {2016-03-01},
48+
keywords = {Classification, Ensemble, Feature selection, Imbalanced data, Oil reservoir},
49+
}
50+
51+
@article{gong_rhsboost_2017,
52+
title = {{RHSBoost}: Improving classification performance in imbalance data},
53+
volume = {111},
54+
issn = {0167-9473},
55+
url = {https://www.sciencedirect.com/science/article/pii/S016794731730018X},
56+
doi = {10.1016/j.csda.2017.01.005},
57+
shorttitle = {{RHSBoost}},
58+
abstract = {Imbalance data are defined as a dataset whose proportion of classes is severely skewed. Classification performance of existing models tends to deteriorate due to class distribution imbalance. In addition, over-representation by majority classes prevents a classifier from paying attention to minority classes, which are generally more interesting. An effective ensemble classification method called {RHSBoost} has been proposed to address the imbalance classification problem. This classification rule uses random undersampling and {ROSE} sampling under a boosting scheme. According to the experimental results, {RHSBoost} appears to be an attractive classification model for imbalance data.},
59+
pages = {1--13},
60+
journaltitle = {Computational Statistics \& Data Analysis},
61+
shortjournal = {Computational Statistics \& Data Analysis},
62+
author = {Gong, Joonho and Kim, Hyunjoong},
63+
urldate = {2024-02-13},
64+
date = {2017-07-01},
65+
keywords = {{AUC}, {AdaBoost}, Ensemble, Imbalanced data, {RHSBoost}, Undersampling},
66+
}
67+
68+
@article{haixiang_learning_2017,
69+
title = {Learning from class-imbalanced data: Review of methods and applications},
70+
volume = {73},
71+
issn = {0957-4174},
72+
url = {https://www.sciencedirect.com/science/article/pii/S0957417416307175},
73+
doi = {10.1016/j.eswa.2016.12.035},
74+
shorttitle = {Learning from class-imbalanced data},
75+
abstract = {Rare events, especially those that could potentially negatively impact society, often require humans’ decision-making responses. Detecting rare events can be viewed as a prediction task in data mining and machine learning communities. As these events are rarely observed in daily life, the prediction task suffers from a lack of balanced data. In this paper, we provide an in depth review of rare event detection from an imbalanced learning perspective. Five hundred and seventeen related papers that have been published in the past decade were collected for the study. The initial statistics suggested that rare events detection and imbalanced learning are concerned across a wide range of research areas from management science to engineering. We reviewed all collected papers from both a technical and a practical point of view. Modeling methods discussed include techniques such as data preprocessing, classification algorithms and model evaluation. For applications, we first provide a comprehensive taxonomy of the existing application domains of imbalanced learning, and then we detail the applications for each category. Finally, some suggestions from the reviewed papers are incorporated with our experiences and judgments to offer further research directions for the imbalanced learning and rare event detection fields.},
76+
pages = {220--239},
77+
journaltitle = {Expert Systems with Applications},
78+
shortjournal = {Expert Systems with Applications},
79+
author = {Haixiang, Guo and Yijing, Li and Shang, Jennifer and Mingyun, Gu and Yuanyue, Huang and Bing, Gong},
80+
urldate = {2023-12-17},
81+
date = {2017-05-01},
82+
keywords = {Data mining, Imbalanced data, Machine learning, Rare events},
83+
}
84+
285
@online{tin_kam_ho_random_1995,
386
title = {Random decision forests},
487
url = {https://ieeexplore.ieee.org/document/598994},
@@ -220,23 +303,6 @@ @online{noauthor_quest-ce_nodate
220303
langid = {french},
221304
}
222305

223-
@article{haixiang_learning_2017,
224-
title = {Learning from class-imbalanced data: Review of methods and applications},
225-
volume = {73},
226-
issn = {0957-4174},
227-
url = {https://www.sciencedirect.com/science/article/pii/S0957417416307175},
228-
doi = {10.1016/j.eswa.2016.12.035},
229-
shorttitle = {Learning from class-imbalanced data},
230-
abstract = {Rare events, especially those that could potentially negatively impact society, often require humans’ decision-making responses. Detecting rare events can be viewed as a prediction task in data mining and machine learning communities. As these events are rarely observed in daily life, the prediction task suffers from a lack of balanced data. In this paper, we provide an in depth review of rare event detection from an imbalanced learning perspective. Five hundred and seventeen related papers that have been published in the past decade were collected for the study. The initial statistics suggested that rare events detection and imbalanced learning are concerned across a wide range of research areas from management science to engineering. We reviewed all collected papers from both a technical and a practical point of view. Modeling methods discussed include techniques such as data preprocessing, classification algorithms and model evaluation. For applications, we first provide a comprehensive taxonomy of the existing application domains of imbalanced learning, and then we detail the applications for each category. Finally, some suggestions from the reviewed papers are incorporated with our experiences and judgments to offer further research directions for the imbalanced learning and rare event detection fields.},
231-
pages = {220--239},
232-
journaltitle = {Expert Systems with Applications},
233-
shortjournal = {Expert Systems with Applications},
234-
author = {Haixiang, Guo and Yijing, Li and Shang, Jennifer and Mingyun, Gu and Yuanyue, Huang and Bing, Gong},
235-
urldate = {2023-12-17},
236-
date = {2017-05-01},
237-
keywords = {Data mining, Imbalanced data, Machine learning, Rare events},
238-
}
239-
240306
@article{galar_review_2012,
241307
title = {A Review on Ensembles for the Class Imbalance Problem: Bagging-, Boosting-, and Hybrid-Based Approaches},
242308
volume = {42},

res/diagram/MLv1.png

73.4 KB
Loading
67.3 KB
Loading
74.7 KB
Loading

sections/analysis.tex

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,14 @@
22
\graphicspath{{\subfix{../res/}}}
33
\begin{document}
44
We will now do an analysis from the literature review on how we can approach the problem, using what's been already made and how we can improve on it.
5-
We've seen from figures \ref{fig:nb_pub, fig:nb_pub_scopus_predictstudent,fig:nb_pub_scopus_predictstudent_country,fig:nb_pub_scopus_predictstudent_subject, fig:nb_pub_scopus_predictstudent_AI, fig:nb_pub_scopus_predictstudent_country} and \ref{fig:nb_pub_scopus_predictstudent_subject} how much the field of study for early system detection for student drop-out is extensive. However, this is not the goal of our study to create a framework for a system that could both predict students dropout but more importantly success and excellence for a specific formation.
5+
We've seen from figures \ref{fig:nb_pub}
6+
\ref{fig:nb_pub_scopus_predictstudent}
7+
\ref{fig:nb_pub_scopus_predictstudent_AI}
8+
\ref{fig:nb_pub_scopus_predictstudent_country}
9+
\ref{fig:nb_pub_scopus_predictstudent_country_AI}
10+
\ref{fig:nb_pub_scopus_predictstudent_subject}
11+
\ref{fig:nb_pub_scopus_predictstudent_subject_AI}
12+
how extensive the field of study for early detection systems for student drop-out is. However, the goal of our study is not merely drop-out detection: we aim to create a framework for a system that could predict not only student drop-out but, more importantly, success and excellence for a specific formation.
613
However, we can extrapolate and hypothesise that, since such systems have been developed and beneficially used to predict students at risk, continuing this research could lead to a three-in-one system that helps in the registration process, detects students at risk as early as registration, and helps excellent students in a specific formation to unleash their true potential.
714
We will take our findings and use them to determine student success, beginning with the factors (subsection \ref{subsubsec:soa_analyticalapproach}) to determine which variables we could retrieve from the input data in order to help with the clustering, analysis and prediction.
815

@@ -23,7 +30,7 @@ \subsection{Factors}
2330
\cite{opazo_analysis_2021,tinto_dropout_1975,caspersen_teachers_2015,lidia_problema_2006,bejarano_caso_2017,sinchi_acceso_2018,cavero_voluntad_2011,velasco_alisis_nodate}}
2431

2532
The human part (sociological part) is the most complex in this research. As humans have evolved and will evolve, norms will change, and specific factors relevant now may differ in the future.
26-
As well as the factors and analytical part of the research (subsection \ref{subsubsec:soa_analyticalapproach}, we must consider the outcome we want from such a system. As discussed in subsection \ref{subsubsec:soa_humanapproach}, we must first begin by defining what we hear by meaning \textbf{success}? Because the literature is extensive and from all around the globe (see Figure \ref{fig:nb_pub_scopus_predictstudent_country, fig:nb_pub_scopus_predictstudent_country}
33+
As well as the factors and analytical part of the research (subsection \ref{subsubsec:soa_analyticalapproach}), we must consider the outcome we want from such a system. As discussed in subsection \ref{subsubsec:soa_humanapproach}, we must first define what we mean by \textbf{success}, because the literature is extensive and comes from all around the globe (see Figures \ref{fig:nb_pub_scopus_predictstudent_country} and \ref{fig:nb_pub_scopus_predictstudent_country_AI}).
2734
The goal will not to find the correct and universal combination of factors, but rather give a framework of \acrshort{ml} algorithm and how to feed them depending on the need of the institution and the goal they are thriving towards.
2835

2936
\subsection{Machine Learning algorithm}
@@ -43,5 +50,5 @@ \subsection{Analysis conclusion}
4350
\label{subsec:analysis_conclusion}
4451
Both our hypotheses and results must now be verified by providing a methodology and using a test dataset sent through our pipeline in order to feed our machines.
4552
We may find that one or both hypotheses are not correct, in which case we will need to restudy the factors and machine learning algorithms to answer our needs and problematic.
46-
In the next part, \ref{sec:conceptualanalysis} Conceptual implementation, we are going to present our methodology and workflow. Explaining the reasons for our choice of factors and algorithm as well as presenting our entire pipeline for our system.
53+
In the next part, Conceptual implementation, we are going to present our methodology and workflow, explaining the reasons for our choice of factors and algorithms, as well as presenting the entire pipeline for our system.
4754
\end{document}

0 commit comments

Comments
 (0)