Skip to content
This repository was archived by the owner on Sep 29, 2025. It is now read-only.

Commit e6e4e53

Browse files
committed
Analysis + Conceptual proposal.
1 parent d68b307 commit e6e4e53

17 files changed

+265
-72
lines changed

bib/references.bib

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,95 @@
11

2+
@article{song_decision_2015,
  title        = {Decision tree methods: applications for classification and prediction},
  shorttitle   = {Decision tree methods},
  volume       = {27},
  number       = {2},
  pages        = {130--135},
  issn         = {1002-0829},
  url          = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4466856/},
  doi          = {10.11919/j.issn.1002-0829.215044},
  abstract     = {Decision tree methodology is a commonly used data mining method for establishing classification systems based on multiple covariates or for developing prediction algorithms for a target variable. This method classifies a population into branch-like segments that construct an inverted tree with a root node, internal nodes, and leaf nodes. The algorithm is non-parametric and can efficiently deal with large, complicated datasets without imposing a complicated parametric structure. When the sample size is large enough, study data can be divided into training and validation datasets. Using the training dataset to build a decision tree model and a validation dataset to decide on the appropriate tree size needed to achieve the optimal final model. This paper introduces frequently used algorithms used to develop decision trees (including {CART}, C4.5, {CHAID}, and {QUEST}) and describes the {SPSS} and {SAS} programs that can be used to visualize tree structure.},
  journaltitle = {Shanghai Archives of Psychiatry},
  shortjournal = {Shanghai Arch Psychiatry},
  author       = {Song, Yan-yan and Lu, Ying},
  urldate      = {2023-12-17},
  date         = {2015-04-25},
  pmid         = {26120265},
  pmcid        = {PMC4466856},
}
20+
21+
@book{rokach_data_2015,
  title      = {Data mining with decision trees: theory and applications},
  shorttitle = {Data mining with decision trees},
  edition    = {2},
  location   = {Hackensack, New Jersey},
  publisher  = {World Scientific},
  isbn       = {978-981-4590-07-5},
  pagetotal  = {305},
  author     = {Rokach, Lior and Maimon, Oded},
  date       = {2015},
  langid     = {english},
  keywords   = {Data mining, Decision support systems, Decision trees, Machine learning},
}
34+
35+
@collection{hofmann_rapidminer_2016,
  title      = {{RapidMiner}: Data Mining Use Cases and Business Analytics Applications},
  shorttitle = {{RapidMiner}},
  isbn       = {978-0-429-17109-3},
  url        = {https://www.taylorfrancis.com/books/9781482205503},
  doi        = {10.1201/b16023},
  publisher  = {Chapman and Hall/{CRC}},
  editor     = {Hofmann, Markus and Klinkenberg, Ralf},
  urldate    = {2023-12-17},
  date       = {2016-04-19},
  langid     = {english},
}
48+
49+
@online{noauthor_quest-ce_nodate,
  title      = {Qu'est-ce que le boosting ? – Le boosting dans le cadre du machine learning expliqué – {AWS}},
  shorttitle = {Qu'est-ce que le boosting ?},
  url        = {https://aws.amazon.com/fr/what-is/boosting/},
  abstract   = {Découvrez ce qu'est le boosting, comment il fonctionne avec l'{IA}/le {ML} et comment utiliser le boosting dans le cadre du machine learning sur {AWS}.},
  titleaddon = {Amazon Web Services, Inc.},
  urldate    = {2023-12-17},
  langid     = {french},
}
58+
59+
@article{haixiang_learning_2017,
  title        = {Learning from class-imbalanced data: Review of methods and applications},
  shorttitle   = {Learning from class-imbalanced data},
  volume       = {73},
  pages        = {220--239},
  issn         = {0957-4174},
  url          = {https://www.sciencedirect.com/science/article/pii/S0957417416307175},
  doi          = {10.1016/j.eswa.2016.12.035},
  abstract     = {Rare events, especially those that could potentially negatively impact society, often require humans’ decision-making responses. Detecting rare events can be viewed as a prediction task in data mining and machine learning communities. As these events are rarely observed in daily life, the prediction task suffers from a lack of balanced data. In this paper, we provide an in depth review of rare event detection from an imbalanced learning perspective. Five hundred and seventeen related papers that have been published in the past decade were collected for the study. The initial statistics suggested that rare events detection and imbalanced learning are concerned across a wide range of research areas from management science to engineering. We reviewed all collected papers from both a technical and a practical point of view. Modeling methods discussed include techniques such as data preprocessing, classification algorithms and model evaluation. For applications, we first provide a comprehensive taxonomy of the existing application domains of imbalanced learning, and then we detail the applications for each category. Finally, some suggestions from the reviewed papers are incorporated with our experiences and judgments to offer further research directions for the imbalanced learning and rare event detection fields.},
  journaltitle = {Expert Systems with Applications},
  shortjournal = {Expert Systems with Applications},
  author       = {Guo, Haixiang and Li, Yijing and Shang, Jennifer and Gu, Mingyun and Huang, Yuanyue and Gong, Bing},
  urldate      = {2023-12-17},
  date         = {2017-05-01},
  keywords     = {Data mining, Imbalanced data, Machine learning, Rare events},
}
75+
76+
@article{galar_review_2012,
  title        = {A Review on Ensembles for the Class Imbalance Problem: Bagging-, Boosting-, and Hybrid-Based Approaches},
  shorttitle   = {A Review on Ensembles for the Class Imbalance Problem},
  volume       = {42},
  number       = {4},
  pages        = {463--484},
  issn         = {1558-2442},
  url          = {https://ieeexplore.ieee.org/document/5978225},
  doi          = {10.1109/TSMCC.2011.2161285},
  abstract     = {Classifier learning with data-sets that suffer from imbalanced class distributions is a challenging problem in data mining community. This issue occurs when the number of examples that represent one class is much lower than the ones of the other classes. Its presence in many real-world applications has brought along a growth of attention from researchers. In machine learning, the ensemble of classifiers are known to increase the accuracy of single classifiers by combining several of them, but neither of these learning techniques alone solve the class imbalance problem, to deal with this issue the ensemble learning algorithms have to be designed specifically. In this paper, our aim is to review the state of the art on ensemble techniques in the framework of imbalanced data-sets, with focus on two-class problems. We propose a taxonomy for ensemble-based methods to address the class imbalance where each proposal can be categorized depending on the inner ensemble methodology in which it is based. In addition, we develop a thorough empirical comparison by the consideration of the most significant published approaches, within the families of the taxonomy proposed, to show whether any of them makes a difference. This comparison has shown the good behavior of the simplest approaches which combine random undersampling techniques with bagging or boosting ensembles. In addition, the positive synergy between sampling techniques and bagging has stood out. Furthermore, our results show empirically that ensemble-based algorithms are worthwhile since they outperform the mere use of preprocessing techniques before learning the classifier, therefore justifying the increase of complexity by means of a significant enhancement of the results.},
  journaltitle = {{IEEE} Transactions on Systems, Man, and Cybernetics, Part C (Applications and Reviews)},
  author       = {Galar, Mikel and Fernandez, Alberto and Barrenechea, Edurne and Bustince, Humberto and Herrera, Francisco},
  urldate      = {2023-12-17},
  date         = {2012-07},
}
92+
293
@book{durkheim_suicide_1951,
394
title = {Suicide, a Study in Sociology},
495
isbn = {978-0-02-908660-5},

main.tex

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
\documentclass[conference]{IEEEtran}
22
\IEEEoverridecommandlockouts
3-
% The preceding line is only needed to identify funding in the first footnote. If that is unneeded, please comment it out.
3+
4+
%Loading packages
45
\usepackage{cite}
56
\usepackage{amsmath,amssymb,amsfonts}
67
\usepackage{algorithmic}
@@ -9,12 +10,19 @@
910
\usepackage{xcolor}
1011
\usepackage{float}
1112
\usepackage{subfiles}
13+
\usepackage[toc]{glossaries}
14+
15+
%Style coding
1216
%\restylefloat{table}
1317
\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
1418
T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}
15-
\begin{document}
19+
20+
%Load glossary
21+
\loadglsentries{res/glossaries/abbreviation}
1622

1723

24+
\begin{document}
25+
1826
\title{Data-Driven Admissions in Education: Enhancing Student Success by Matching Profiles to Optimal Academic Paths\\
1927
{\footnotesize \textsuperscript{}}
2028
\thanks{}
@@ -30,18 +38,9 @@
3038
\pagestyle{plain}
3139

3240
\tableofcontents
33-
34-
\section*{abbreviation}
35-
\begin{itemize}
36-
\item[] AI : Artificial Intelligence
37-
\item[] ML : Machine learning
38-
\item[] KNN : K-Nearest Neighbors
39-
\item[] CNN : Convolutional Neural Networks
40-
\item[] RNN Recurrent Neural Networks
41-
\item[] SVM : Support Vector Machines
42-
\item[] RF : Random Forest
43-
\item[] SMOTE : Synthetic Minority Oversampling TEchniques
44-
\end{itemize}
41+
\listoffigures
42+
\listoftables
43+
\printglossary[type=\acronymtype, title=Acronyms, toctitle=Acronyms]
4544

4645
\vspace{16pt}
4746
\begin{abstract}
@@ -87,5 +86,6 @@ \section{Acknowledgment}
8786

8887
\bibliographystyle{plain}
8988
\bibliography{bib/references}
89+
\printglossary
9090

9191
\end{document}

res/diagram/ML Workflow.png

-18.4 KB
Loading

res/glossaries/abbreviation.tex

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
\makeglossaries
2+
3+
\newacronym{ml}{ML}{Machine learning}
4+
5+
\newacronym{ai}{AI}{Artificial Intelligence}
6+
7+
\newacronym{knn}{KNN}{K-Nearest Neighbors}
8+
9+
\newacronym{cnn}{CNN}{Convolutional Neural Networks }
10+
11+
\newacronym{rnn}{RNN}{Recurrent Neural Networks }
12+
13+
\newacronym{svm}{SVM}{Support Vector Machines }
14+
15+
\newacronym{rf}{RF}{Random Forest}
16+
17+
\newacronym{smote}{SMOTE}{Synthetic Minority Over-sampling Technique}
18+
19+
\newacronym{roc}{ROC}{Receiver operating characteristic}
20+
21+
\newacronym{nn}{NN}{Neural Networks}
22+
23+
\newacronym{auc}{AUC}{Area Under the Curve}
24+
25+
\newacronym{ann}{ANN}{Artificial Neural Networks}
26+
27+
\newacronym{pca}{PCA}{Principal Component Analysis}
28+
29+
\newacronym{dt}{DT}{Decision Tree}
30+
31+
\newacronym{uppa}{UPPA}{Université de Pau et des Pays de l'Adour}

sections/conceptual_proposal.tex

Lines changed: 61 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2,19 +2,71 @@
22
\graphicspath{{\subfix{../res/}}}
33
\begin{document}
44

5-
We now need to create our pipeline and workflow before we can start building it. The questions we need to answer are : what data are we going to feed into the pipeline and which algorithm are we going to feed?
6-
It is clear that by the results from our state of the art and analysis that not one algorithm must be used in our workflow to achieve the best result.
7-
To extract as much information and get the possible best results, we have split our system into three inner parts, each with their responsibility, input and output.
8-
But first, let's look into which data we have access to and what to use to feed our system.
5+
As we embark on constructing our pipeline for data-driven student success prediction, we must first delineate the data inputs and algorithmic strategies.
6+
It is clear from the results of our state-of-the-art review and analysis that a multifaceted algorithmic approach is warranted to optimize outcomes.
7+
To extract as much information and get the possible best results, we have split our system into three inner parts, each with their responsibility, input, and output.
8+
But first, let us look into which data we have access to, and determine which data are pertinent for ingestion into the system.
99

1010
\subsection{Feeding data}
1111
\label{subsec:conceptualimplementation_feedingdata}
12+
Our literature survey has identified several key factors influencing student retention and success. We can extrapolate and hypothesize that such broad factors could be used to determine a student's success.
13+
These factors, hypothesized to be critical in predicting student trajectories, are:
1214

13-
\begin{figure*}
15+
\begin{itemize}
16+
\item Family
17+
\item Previous educational background
18+
\item Academic potential
19+
\item Normative congruence
20+
\item Friendship support
21+
\item Intellectual development
22+
\item Educational performance
23+
\item Social integration
24+
\item Satisfaction
25+
\item Institutional commitment
26+
\item Student adaptation
27+
\item Strict School Rules
28+
\end{itemize}
29+
30+
The available dataset for our experiment will be processed to align with these factors, ensuring that each is represented accurately to serve as a foundation for our predictive models.
31+
\begin{itemize}
32+
\item Family
33+
\item Previous educational background
34+
\item Academic potential
35+
\item Normative congruence
36+
\item Friendship support
37+
\item Intellectual development
38+
\item Educational performance
39+
\item Social integration
40+
\item Satisfaction
41+
\item Institutional commitment
42+
\item Student adaptation
43+
\item Strict School Rules
44+
\end{itemize}
45+
46+
47+
\subsection{Data workflow}
48+
\label{subsec:conceptualimplementation_dataworkflow}
49+
50+
Our workflow, as depicted in Figure \ref{fig:dataworkflow}, is designed to systematically transform raw data into actionable insights.
51+
52+
\begin{figure}
1453
\centering
15-
\includegraphics[width=\textwidth]{res//diagram/ML Workflow.png}
16-
\caption{Machine Learning methodology}
17-
\label{fig:Machine workflow}
18-
\end{figure*}
54+
\includegraphics[width=1\linewidth]{res//diagram/ML Workflow.png}
55+
\caption{Algorithmic workflow for data-driven student success prediction.}
56+
\label{fig:dataworkflow}
57+
\end{figure}
58+
59+
Each component of the workflow serves a strategic purpose:
60+
61+
\begin{enumerate}
62+
\item \acrfull{nn}: To model complex non-linear relationships and interactions among the input variables.
63+
\item \acrfull{pca}: For dimensionality reduction, facilitating computational efficiency and data visualization.
64+
\item K-Means clustering: To identify natural groupings within the student population.
65+
\item Isolation Forest: For anomaly detection, highlighting atypical cases that may require special attention.
66+
\item Lasso Regression: To perform feature selection, enhancing model interpretability by isolating significant predictors.
67+
\end{enumerate}
68+
69+
\subsection{Validation and Expected Outcomes}
70+
The efficacy of our approach will be gauged through rigorous validation techniques, ensuring the reliability of our predictions. We anticipate that this comprehensive workflow will yield a robust predictive model capable of identifying students at risk and informing targeted interventions.
1971

2072
\end{document}

sections/introduction.tex

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,15 +21,15 @@
2121

2222
In France, the Parcoursup system was introduced to manage the influx of candidates and match them with suitable programs. Despite its intentions for uniformity and transparency, it has faced criticism for not adequately addressing the mismatch between student potential and program suitability, which is a contributing factor to the low success rates \cite{couto_parcoursup_2021}.
2323

24-
But what can be categorized as student success, and how to tell if a student enters the profile of excellence? Most of the time, success is, for institute, the number of student who graduate their degree. \cite{weatherton_success_2021}. However, even if this correlation can indicate a good rate of success for an institute or university, is it really indicative of real success? For our research, we have chosen to stay with the simple success definition of how many students can graduate to stay within a binary model for our ML. However, to really answer the problematic of highlighting an excellent student, we will take into account other factors.
24+
But what can be categorized as student success, and how can we tell if a student fits the profile of excellence? Most of the time, success is, for an institution, the number of students who graduate with their degree \cite{weatherton_success_2021}. However, even if this correlation can indicate a good rate of success for an institution or university, is it really indicative of real success? For our research, we have chosen to keep the simple definition of success as how many students graduate, in order to stay within a binary model for our \acrfull{ml}. However, to really answer the problem of highlighting an excellent student, we will take other factors into account.
2525
Another definition we have to pin down is that of an excellent student. It could be written like so: “an excellent student is one with good grades, a good understanding of the concepts and a general interest in the field of study.” Setting aside students with a natural ease for learning, an excellent student may not perform well in a general curriculum, but may stand out in a specific field he or she is interested in.
2626

27-
This research endeavours to explore and validate the potential of ML and data analytic in revolutionizing the admission process. The motivation is twofold: to enhance the success rate of students by ensuring they are placed in programs where they are most likely to excel, and to reduce dropout rates by minimizing mismatches between students and programs. We also thrive to found more excellence students within the mass of registration.
27+
This research endeavours to explore and validate the potential of \acrfull{ml} and data analytics in revolutionizing the admission process. The motivation is twofold: to enhance the success rate of students by ensuring they are placed in programs where they are most likely to excel, and to reduce dropout rates by minimizing mismatches between students and programs. We also strive to find more excellent students within the mass of registrations.
2828

2929
We are going to base our experiments and results on the French academic system. However, we proceed from the principle that any academic system could use this research to build such registration-assistance systems. Because of a lack of literature on the French system, we have extended our scope to the entire world, including the different academic systems of different countries. Because the academic process matters less than the actual needs and hopes from a student and institution point of view, we can exploit these different data for our research. Yet, as researchers and as readers, we suggest that a line be drawn, and remember that a big part of this system is the culture of the country in which it is based. We are using universal factors to feed our system, but some may vary from country to country.
3030

3131
Another point we need to clarify is that this research is not made to discriminate against students nor to help the "elite" by creating an even bigger chasm in societal problems. It is in fact a way to reduce this gap and give each student a chance of getting into higher education and earning some sort of diploma that will suit their needs and hopes.
3232

33-
This research will explore and proposes of different approach, starting with a detailed methodology, followed by a case study made within the University of Pau et des Pays de l'Adour.
33+
This research will explore and propose different approaches, starting with a detailed methodology, followed by a case study made within the \acrfull{uppa}.
3434
We will then conclude, taking into account our findings and the result of our experiment
3535
\end{document}

0 commit comments

Comments
 (0)