Skip to content
This repository was archived by the owner on Sep 29, 2025. It is now read-only.

Commit 9fc6ac6

Browse files
committed
Rewriting of intro to conc prop
1 parent 7ae04d3 commit 9fc6ac6

12 files changed

+170
-103
lines changed

bib/references.bib

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,43 @@
11

2+
@online{tin_kam_ho_random_1995,
3+
title = {Random decision forests},
4+
url = {https://ieeexplore.ieee.org/document/598994},
5+
abstract = {Decision trees are attractive classifiers due to their high execution speed. But trees derived with traditional methods often cannot be grown to arbitrary complexity for possible loss of generalization accuracy on unseen data. The limitation on complexity usually means suboptimal accuracy on training data. Following the principles of stochastic modeling, we propose a method to construct tree-based classifiers whose capacity can be arbitrarily expanded for increases in accuracy for both training and unseen data. The essence of the method is to build multiple trees in randomly selected subspaces of the feature space. Trees in, different subspaces generalize their classification in complementary ways, and their combined classification can be monotonically improved. The validity of the method is demonstrated through experiments on the recognition of handwritten digits.},
6+
author = {Ho, Tin Kam},
7+
urldate = {2023-11-09},
8+
date = {1995-08-14},
9+
langid = {american},
10+
}
11+
12+
@inproceedings{hegde_dimensionality_2016,
13+
title = {Dimensionality reduction technique for developing undergraduate student dropout model using principal component analysis through R package},
14+
url = {https://ieeexplore.ieee.org/abstract/document/7919670},
15+
doi = {10.1109/ICCIC.2016.7919670},
16+
abstract = {Every educational institute feels proud when its admission closes with expected number of students. The prospective student enters the campus with lots of hopes, dreams and expectations. When their expectations are not met or if they undergo for critical circumstances and makes them drop from their registered program. Predicting undergraduate student dropouts are a major challenge in educational system due to the multidimensionality of data. This paper focuses on dimensionality reduction of multi-behavioral attributes of a 150 students with 51 attribute to identify the factor that affects the early dropout. The dataset dimensionality is reduced through Principal Component Analysis by obtaining the Eigenvalues and Eigenvectors from the covariance matrix by transforming the original attribute into new set attribute without losing the information. Visualization is done with a help of R package factoextra and {FactoMineR}. The further dataset can be used for classification. The discovery of concealed knowledge can be used for better academic planning and early prediction of student dropout.},
17+
eventtitle = {2016 {IEEE} International Conference on Computational Intelligence and Computing Research ({ICCIC})},
18+
pages = {1--6},
19+
booktitle = {2016 {IEEE} International Conference on Computational Intelligence and Computing Research ({ICCIC})},
20+
author = {Hegde, Vinayak},
21+
urldate = {2024-02-11},
22+
date = {2016-12},
23+
note = {{ISSN}: 2473-943X},
24+
keywords = {Behavioral, Classification, Correlation, Covariance matrices, Education, Eigenvalues and eigenfunctions, {FactoMineR}, Feature extraction, Media, {PCA}, Principal component analysis, Student Survey, dropout, factoextra, undergraduate},
25+
}
26+
27+
@article{vieira_corrosion_nodate,
28+
title = {Corrosion Analysis and Identification Through Integration of Machine Learning and Cyber-Physical Sensors},
29+
abstract = {Corrosion poses a significant threat to the integrity and longevity of metal structures, such as plates and buildings, impacting safety, functionality, and economic sustainability. Traditional corrosion detection methods often rely on periodic inspections, which may be time-consuming and prone to human error. Incorporating sensors on this matter improves real-time monitoring and data collection for a comprehensive understanding of the corrosion process. Utilizing machine learning algorithms allows for better analysis and prediction of datasets obtained from sensors strategically placed on structures. This aids engineers in making informed decisions on corrosion remediation. This scientific article explores innovative approaches in corrosion analysis and identification, leveraging the synergy of machine learning technologies and cyber-physical sensors.},
30+
author = {Vieira, Bruno Froelich Giora},
31+
langid = {english},
32+
}
33+
34+
@article{nawfal_autonomous_nodate,
35+
title = {Autonomous Vehicles in Last-Mile Delivery: Facing Urban Congestion and Sustainability Problems},
36+
abstract = {This research paper dives into the changing role of autonomous vehicles ({AVs}) in developing last-mile delivery in urban environments. It concerns the important challenges of urban congestion and sustainability, exacerbated by the growing demands of e-commerce. The study exposes the benefits of integrating {AVs} in urban delivery systems and gives a multi-faceted approach on how to successfully achieve this integration despite its challenging complexity. The paper aims to give how we can improve the future of urban transportation and logistics by diving in the implications of {AV} integration in last-mile delivery, from different points of views.},
37+
author = {Nawfal, Adil},
38+
langid = {english},
39+
}
40+
241
@online{noauthor_covid-19_nodate,
342
title = {{COVID}-19 data {\textbar} {WHO} {COVID}-19 dashboard},
443
url = {https://data.who.int/dashboards/covid19/data},
@@ -674,14 +713,6 @@ @article{lee_machine_2019
674713
keywords = {big data, class-imbalance, dropout, ensemble, machine learning, oversampling},
675714
}
676715

677-
@online{noauthor_random_nodate,
678-
title = {Random decision forests},
679-
url = {https://ieeexplore.ieee.org/document/598994},
680-
abstract = {Decision trees are attractive classifiers due to their high execution speed. But trees derived with traditional methods often cannot be grown to arbitrary complexity for possible loss of generalization accuracy on unseen data. The limitation on complexity usually means suboptimal accuracy on training data. Following the principles of stochastic modeling, we propose a method to construct tree-based classifiers whose capacity can be arbitrarily expanded for increases in accuracy for both training and unseen data. The essence of the method is to build multiple trees in randomly selected subspaces of the feature space. Trees in, different subspaces generalize their classification in complementary ways, and their combined classification can be monotonically improved. The validity of the method is demonstrated through experiments on the recognition of handwritten digits.},
681-
urldate = {2023-11-09},
682-
langid = {american},
683-
}
684-
685716
@article{m_alban_she_is_with_the_faculty_of_engineering_and_applied_sciences_of_the_technical_university_cotopaxi_neural_2019,
686717
title = {Neural Networks to Predict Dropout at the Universities},
687718
volume = {9},

res/AlgorithmDefinition.tex

Lines changed: 0 additions & 4 deletions
This file was deleted.

sections/analysis.tex

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,12 +2,14 @@
22
\graphicspath{{\subfix{../res/}}}
33
\begin{document}
44
We will now do an analysis from the literature review on how we can approach the problem, using what's been already made and how we can improve on it.
5+
We have seen from figures \ref{fig:nb_pub}, \ref{fig:nb_pub_scopus_predictstudent}, \ref{fig:nb_pub_scopus_predictstudent_country}, \ref{fig:nb_pub_scopus_predictstudent_AI} and \ref{fig:nb_pub_scopus_predictstudent_subject} how extensive the field of study on early detection systems for student drop-out is. However, the goal of our study differs: we aim to create a framework for a system that could not only predict student dropout but, more importantly, success and excellence for a specific formation.
6+
However, we can extrapolate and hypothesise that, since such systems have been beneficially used and developed to predict students at risk, a continuation of this research could lead to a three-in-one system that helps in the registration process, detects students at risk as early as registration, and helps excellent students in a specific formation unleash their true potential.
7+
We will take our findings and use them to determine student success, beginning with the factors (subsection \ref{subsubsec:soa_analyticalapproach}) to determine the variables we could retrieve from the feeding data in order to help with clustering, analysis and prediction.
58

69
\subsection{Factors}
710
\label{subsec:analysis_factors}
811
First of all, what differentiates this research from all the others we have read throughout the literature analysis is that we are not seeking to predict students' dropout but rather student success, early in the process and not during the curriculum year. However, there is plenty of interesting information we can gather from these papers. As described in the State of the Art (section \ref{sec:soa}), we can gather factors that, in theory, could help predict students' dropout. We can hypothesize that the same factors used to determine whether one student is at risk of dropping out could, for another, predict their success in a specific formation. From the list of factors we were able to gather, we have made a statistical analysis of the frequency with which they appear and their overall score within each paper in which they are mentioned. Below is the table from this study concluding our research.
912

10-
1113
This part of the study showed us how difficult it was to examine our feeding data and how to define the output we are thriving for. Factors, whether they are inputs or outputs, will strongly depend on multiple factors like :
1214
\begin{itemize}
1315
\item Societal norm
@@ -17,12 +19,25 @@ \subsection{Factors}
1719
\item Time frame (all factors above will change over time)
1820
\end{itemize}
1921

22+
\textit{This list has been summarized and grouped into 5 categories, which all include different factors, as available to read in subsection \ref{subsubsec:soa_analyticalapproach} and in the following papers :
23+
\cite{opazo_analysis_2021,tinto_dropout_1975,caspersen_teachers_2015,lidia_problema_2006,bejarano_caso_2017,sinchi_acceso_2018,cavero_voluntad_2011,velasco_alisis_nodate}}
24+
2025
The human part (sociological part) is the most complex in this research. As human have evolved and will evolve, norms will change, and specific factors now may differ in the future.
26+
As well as the factors and analytical part of the research (subsection \ref{subsubsec:soa_analyticalapproach}), we must consider the outcome we want from such a system. As discussed in subsection \ref{subsubsec:soa_humanapproach}, we must first begin by defining what we mean by \textbf{success}, because the literature is extensive and comes from all around the globe (see Figure \ref{fig:nb_pub_scopus_predictstudent_country}).
2127
The goal will not be to find the correct and universal combination of factors, but rather to give a framework of \acrshort{ml} algorithms and how to feed them depending on the needs of the institution and the goal they are striving towards.
2228

2329
\subsection{Machine Learning algorithm}
2430
\label{subsec:analysis_mlalgo}
25-
Secondly, we need to understand which algorithm model have been used the most and which present the best outcome for our need. As for the factors, we can extrapolate the problem and take it in reverse. So by learning which algorithm presents the best result to predict student's dropout, we could hypothesize that they could also be used to detect student's success.
31+
Secondly, we need to understand which algorithm models have been used the most and which present the best outcome for our needs. As for the factors, we can extrapolate the problem and take it in reverse. So by learning which algorithms present the best results to predict students' dropout, we could hypothesize that they could also be used to detect students' success, as already discussed throughout this paper. Many algorithms have been studied in the field, and, for our experiment, we are going to use the following ones :
32+
33+
\begin{enumerate}
34+
\item \acrfull{nn} : To transform human data into profiles other algorithms can understand. This technique has already been researched and used, as we can read from : \cite{m_alban_she_is_with_the_faculty_of_engineering_and_applied_sciences_of_the_technical_university_cotopaxi_neural_2019, siri_predicting_2015, viloria_integration_2019, zhang_neural_2000}.
35+
\item \acrfull{pca} : To reduce the dimensionality issues we could encounter in such a system, given the amount of data and the complexity of our model. This technique has been intensively used in analytical studies, and thus has not been directly studied for the specific need of analysing student dropout or success. For this research, PCA could be invaluable: it can help identify the most significant factors affecting student success from a large dataset, reducing the number of variables we need to consider.
36+
\item K-Means Clustering : To cluster and group each profile into one of our three categories (\textbf{excellent, average} or \textbf{bad}). It has efficiently been used for clustering tasks in multiple research works over the years, and not only in the field of predicting student success or failure \cite{de_o_santos_supervised_2019, mardolkar_forecasting_2020,shiful_machine_2021}.
37+
\item \acrfull{if} : It is effective in identifying outliers in data, which could be useful for detecting atypical student profiles or behaviours that deviate significantly from the norm, thus helping to find excellent and at-risk students.
38+
\item \acrfull{lr} : Lasso Regression could help identify the most impactful factors on student success by eliminating less relevant variables, thus simplifying our model and possibly improving its predictive performance.
39+
\end{enumerate}
40+
2641

2742
\subsection{Analysis conclusion}
2843
\label{subsec:analysis_conclusion}

sections/conprop.tex

Lines changed: 7 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,12 @@
1414
\subsection{Feeding data}
1515
\label{subsec:conprop_feedingdata}
1616
Our literature survey (\ref{subsubsec:soa_analyticalapproach}) has identified several key factors influencing student retention and success. We can extrapolate and hypothesise that such wide-ranging factors could be used to determine students' success.
17+
Because our available data is limited and does not answer all the needs of such a system, a smaller system will be developed inside the implementation section (\ref{sec:imp}). If an institution wanted to follow through with the implementation of our framework inside their registration process, they would need to feed the machine (for its optimal potential) :
18+
\begin{itemize}
19+
\item Motivation letter : To help create profiles for each student, based on the information provided inside the motivation letter. (Analysed and used by our \acrfull{nn} model).
20+
\item Previous academic results : To help determine how the student reacts to usual study systems, as well as to assess this student's determination about studies and infrastructures. Contrary to many studies, grades won't be used solely to determine one's capacity. It has now been shown that the systems commonly found throughout the globe don't really help good students elevate to their full potential, but rather create groups of normalized students ready for work, clustering them from \textit{simple workers} to \textit{high-end careerists}. These results and history will be used by the neural network as well as the \acrfull{pca}, K-Means Clustering and \acrfull{if} to help provide a good dataset for the final supervised models, which are going to select the students based on the chosen outcomes.
21+
\item Formation information : Because our system needs to be set up for each formation and diploma to effectively give the best results, institutions will need to feed it information about the formation we want to evaluate the applicant against, as well as former students' profiles, to determine which profiles correspond the most to the specific formation the machine will evaluate.
22+
\end{itemize}
1723

1824
\subsection{Data workflow}
1925
\label{subsec:concimp_dataworkflow}
@@ -29,20 +35,6 @@ \subsection{Data workflow}
2935
\item Lasso Regression: To perform feature selection, enhancing model interpretability by isolating significant predictors.
3036
\end{enumerate}
3137

32-
\subsection{Available dataset}
33-
\label{subsec:available_ds}
34-
The dataset available to us to test this experiment was gently given by the \acrfull{uppa}.
35-
This dataset is composed of every student registration for the SIGLIS Master at the \acrshort{uppa} for the academic year from 2019 to 2024.
36-
37-
The available dataset for our experiment will be processed to align with these factors, ensuring that each is represented accurately to serve as a foundation for our predictive models.
38-
\begin{itemize}
39-
\item Previous educational background : What is this individual background on an educational level? What was their last diploma, which level are they on?
40-
\item Academic potential : Do they have already been approached as potential excellent student?
41-
\item Educational performance : Have they proven performant on an educational level already? How were they previous performance?
42-
\item Institutional commitment : Do they commit to their success and to the institutional life? Or do they only go in class and do the bare minimum?
43-
\end{itemize}
44-
45-
4638
\subsection{Validation and Expected Outcomes}
4739
\label{subsec:concimp_validexcpecoutcomes}
4840
We anticipate that this workflow will yield a robust model capable of identifying excellent students. We will gauge the efficiency of our model through rigorous validation techniques such as \acrfull{roc}, \acrfull{pca}, etc. to ensure the reliability of our predictions.
@@ -69,7 +61,7 @@ \subsection{Usage on the field}
6961
This model should not be used to exclude students, but should only point out the best results for each formation to reduce dropout rates throughout the country. However, other factors outside this model should be studied by the administration of each institution, for each formation, to correctly select students — not only based on the result indicated by our model but also taking into account these other factors. Moreover, any person using this model should not know which information is used, and how, by the model itself. People selecting students should not have anything to do with the registration data, the feeding process and the output of the model. To make this process as impartial as possible, these people should only receive the output, without prior knowledge and without the resources they usually get to choose students, to do their selection.
7062
Finally, as said earlier in this paper, to protect students, no personal information or identity (name, surname, country of origin, etc.) should be involved in the process, and the information shared should not allow anyone to trace back the student based on the information given. That is why compartmentalisation is really important if this system is to be put in place in any institution.
7163

72-
\begin{figure}[H]
64+
\begin{figure}
7365
\centering
7466
\includegraphics[width=1\linewidth]{res//diagram/Fonc imp.png}
7567
\caption{Implementation on the field of the framework (proposal)}

0 commit comments

Comments
 (0)