Data-Science/R code at main · MsAsiat/Data-Science · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
# How many records were captured in the dataset?
# Load the assignment data; the CSV is read relative to the working directory.
data <- read.csv("CETM72 Assignment Dataset.csv")

# View() opens a spreadsheet-style viewer, which is only meaningful (and only
# safe to call) in an interactive session, so it is guarded.
if (interactive()) {
  View(data)
}

# Number of missing values in each variable.
vapply(data, function(column) sum(is.na(column)), integer(1))
names(data) # Variable names

dim(data) # Data dimensions
summary(data) # Per-column summary

# Rename columns 3-5 for ease of use:
#   CAT4.score      - Cognitive Abilities Test (CAT4) level
#   home.grown      - length of enrolment (has the student been enrolled for
#                     a long time?)
#   study_materials - student effort / attempts on the course materials
# The renames are positional, so fail early if the file has fewer columns
# than expected instead of corrupting the column names.
stopifnot(ncol(data) >= 5)
names(data)[3:5] <- c("CAT4.score", "home.grown", "study_materials")

# Generate an id column from the row names and put it first.
data <- cbind(id = rownames(data), data)

# Inspect the prepared data once, after all changes have been applied.
if (interactive()) {
  View(data)
}

dim(data) # Dimension & summary of variables

# Descriptive statistics for one numeric vector: mean, sd, IQR, the requested
# quantiles, skewness, kurtosis and the number of non-missing values.
#
# Base summary() silently ignores `statistics=`, `quantiles=` and `type=`
# (they are swallowed by `...`), so the statistics are computed explicitly.
# Skewness and kurtosis use the "type 2" (SAS/SPSS-style, bias-adjusted)
# definitions; both are undefined (NaN/Inf) for fewer than 4 observations.
#
# x     - numeric vector; missing values are removed before computing.
# probs - quantile probabilities to report.
# Returns a named numeric vector.
describe_score <- function(x, probs = c(0, 0.25, 0.5, 0.75, 1)) {
  x <- x[!is.na(x)]
  n <- length(x)
  centred <- x - mean(x)
  m2 <- mean(centred^2)
  g1 <- mean(centred^3) / m2^1.5 # moment-based skewness
  g2 <- mean(centred^4) / m2^2 - 3 # moment-based excess kurtosis
  c(
    mean = mean(x),
    sd = sd(x),
    IQR = IQR(x),
    quantile(x, probs = probs),
    skewness = g1 * sqrt(n * (n - 1)) / (n - 2),
    kurtosis = ((n + 1) * g2 + 6) * (n - 1) / ((n - 2) * (n - 3)),
    n = n
  )
}

# One row per subject score, one column per statistic.
score_columns <- c("math.score", "science.score", "english.score")
do.call(rbind, lapply(data[score_columns], describe_score))

###################### DATA VISUALIZATION ######################
# Axis labels are written as single-line strings: a string literal wrapped
# across source lines would put a line break into the rendered label.

# Does attempting the end-of-year study material affect students' scores?
boxplot(math.score ~ study_materials, data = data, col = rainbow(7),
        xlab = "study_materials", ylab = "Math Score")
boxplot(english.score ~ study_materials, data = data, col = rainbow(9),
        xlab = "study_materials", ylab = "English Score")
boxplot(science.score ~ study_materials, data = data, col = rainbow(9),
        xlab = "study_materials", ylab = "Science Score")

# Does gender influence student performance in exams?
boxplot(math.score ~ gender, data = data, col = rainbow(7),
        xlab = "Gender", ylab = "Math Score")
boxplot(english.score ~ gender, data = data, col = rainbow(9),
        xlab = "Gender", ylab = "English Score")
boxplot(science.score ~ gender, data = data, col = rainbow(9),
        xlab = "Gender", ylab = "Science Score")

# Does a student's Cognitive Abilities Test (CAT4) result influence their
# performance in exams?
boxplot(math.score ~ CAT4.score, data = data, col = rainbow(7),
        xlab = "CAT4.score", ylab = "Math Score")
boxplot(english.score ~ CAT4.score, data = data, col = rainbow(9),
        xlab = "CAT4.score", ylab = "English Score")
boxplot(science.score ~ CAT4.score, data = data, col = rainbow(9),
        xlab = "CAT4.score", ylab = "Science Score")


# Correlation of scores: pairwise scatter plots of the three subject scores.
# ggplot(), aes() and geom_point() come from ggplot2; attach it here so the
# calls below resolve when the script is run in a fresh R session.
library(ggplot2)

# math.score vs english.score
ggplot(data, aes(x = math.score, y = english.score)) + geom_point()
# science.score vs english.score
ggplot(data, aes(x = science.score, y = english.score)) + geom_point()
# science.score vs math.score
ggplot(data, aes(x = science.score, y = math.score)) + geom_point()

###################### REGRESSION MODEL #########################
# Drop the id column so it is not used as a predictor. Removing it by name
# (rather than by position) keeps this step safe to re-run: a second run is
# a no-op instead of deleting the next column.
data$id <- NULL

# Root-mean-square error between observed and predicted values.
rmse <- function(observed, predicted) {
  sqrt(mean((observed - predicted)^2))
}

# Create training (80%) and testing (remaining 20%) datasets.
# The seed makes the random split reproducible. The index vector is named
# train_idx so that it does not mask base::sample().
set.seed(1)
train_idx <- sample(nrow(data), floor(nrow(data) * 0.8))
train <- data[train_idx, ]
test <- data[-train_idx, ]

# Check dimensions of the training and test data.
dim(train)
dim(test)

# NOTE(review): the `~ .` formulas use every remaining column as a predictor,
# including the other two subject scores -- confirm that this is intended.

# Math score: model on the full data (modelM) and on the training split
# (model_math); the latter is evaluated on the held-out test rows.
modelM <- lm(math.score ~ ., data = data)
model_math <- lm(math.score ~ ., data = train)
summary(model_math)

pred_math <- predict(model_math, newdata = test)
rmse(test$math.score, pred_math) # test-set RMSE
summary(modelM)

# English score: same procedure.
modelE <- lm(english.score ~ ., data = data)
model_english <- lm(english.score ~ ., data = train)
summary(model_english)

pred_english <- predict(model_english, newdata = test)
rmse(test$english.score, pred_english) # test-set RMSE
summary(modelE)

# Science score: same procedure.
modelS <- lm(science.score ~ ., data = data)
model_science <- lm(science.score ~ ., data = train)
summary(model_science)

pred_science <- predict(model_science, newdata = test)
rmse(test$science.score, pred_science) # test-set RMSE
summary(modelS)