-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathXGB_Baseline.R
More file actions
113 lines (83 loc) · 2.27 KB
/
XGB_Baseline.R
File metadata and controls
113 lines (83 loc) · 2.27 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#Kaggle Competition
# Start from a clean workspace.
# NOTE(review): rm(list = ls()) wipes the caller's global environment and
# setwd() below ties the script to one machine. Both are kept so existing
# runs behave the same, but consider removing them.
rm(list = ls())
gc()

# Load the libraries. library() stops with an error when a package is
# missing, whereas require() only returns FALSE and lets the script fail
# later with a less obvious message.
library(caret)
library(Matrix)
library(xgboost)
library(data.table)
library(Metrics)
library(scales)

# Read the training and test sets from the competition directory.
setwd("d:/Kaggle Competition/")
train <- fread("train.csv")
test <- fread("test.csv")

# Keep the test ids (for the submission file) and the training target.
final <- data.table(id = test$id)
train_Y <- train$loss

# Drop id and loss from the predictors (by reference), then stack train on
# top of test so both sets go through the same preprocessing.
train[, c("id", "loss") := NULL]
test[, c("id") := NULL]
data_comb <- rbind(train, test)
# Encode every column whose name contains "cat" as integer factor codes and
# pass them through scales::rescale(). data.table::set() replaces each column
# by reference, which avoids the copy that `[[<-` makes on a data.table and
# keeps the table valid for the `:=` below.
cat_cols <- grep("cat", names(data_comb), value = TRUE)
for (col in cat_cols) {
  codes <- as.integer(as.factor(data_comb[[col]]))
  data.table::set(data_comb, j = col, value = rescale(codes))
}

# Remove highly correlated variables (pairwise correlation cutoff 0.90).
# The surrounding parentheses print the names of the dropped columns.
corr_mat <- cor(data_comb)
(high_corr <- findCorrelation(corr_mat, cutoff = 0.90, names = TRUE))

# Only delete when something was flagged, so an empty result is a clean no-op.
if (length(high_corr) > 0) {
  data_comb[, (high_corr) := NULL]
}
# Train on log(loss + 200); predictions are mapped back with exp(.) - 200.
train_Y <- log(200 + train_Y)

# Split the combined table back into its train and test rows. The first
# length(train_Y) rows are the training set because train was stacked first.
# seq_len()/seq.int(length.out =) give empty index vectors for empty sets,
# unlike 1:0 or (n + 1):n which count backwards.
n_train <- length(train_Y)
train_rows <- seq_len(n_train)
test_rows <- seq.int(n_train + 1L, length.out = nrow(data_comb) - n_train)

# Create the xgboost matrices; only the training matrix carries labels.
dtrain <- xgb.DMatrix(
  data = as.matrix(data_comb[train_rows, ]),
  label = train_Y
)
dtest <- xgb.DMatrix(data = as.matrix(data_comb[test_rows, ]))

# Evaluation set reported during training (the training data itself).
watchlist <- list(train = dtrain)
# Custom xgboost objective. Despite the name this is not logistic regression:
# with residual r = preds - labels and constant k = 2 it returns
#   grad = k * r / (|r| + k)
#   hess = k^2 / (|r| + k)^2
# which are the first and second derivatives of the Fair loss
#   k^2 * (|r| / k - log(1 + |r| / k)).
#
# preds:  numeric vector of current predictions.
# dtrain: object whose "label" field is read via getinfo().
# Returns a list with elements `grad` and `hess`, one value per prediction.
logregobj <- function(preds, dtrain) {
  fair_const <- 2
  residual <- preds - getinfo(dtrain, "label")
  denom <- abs(residual) + fair_const
  list(
    grad = fair_const * residual / denom,
    hess = fair_const^2 / denom^2
  )
}
# Booster settings shared by the cross-validation run and the final fit.
param <- list(
  objective = logregobj,    # custom objective function defined in this script
  eta = 0.005,              # learning rate
  max_depth = 10,
  subsample = 0.8,          # fraction of rows sampled per tree
  colsample_bytree = 0.5,   # fraction of columns sampled per tree
  min_child_weight = 1,
  base_score = 7,           # initial prediction, on the log(loss + 200) scale
  alpha = 1,                # L1 regularisation
  gamma = 1                 # minimum loss reduction required to split
)
# Custom xgboost evaluation metric: mean absolute error computed after
# exponentiating both labels and predictions, i.e. on the scale the target
# had before the log transform. The +200 offset used for the target in this
# script cancels in the difference, so it is not subtracted here.
#
# yhat:   numeric vector of predictions on the log scale.
# dtrain: object whose "label" field is read via getinfo().
# Returns list(metric = "error", value = <MAE>).
xg_eval_mae <- function(yhat, dtrain) {
  actual <- exp(getinfo(dtrain, "label"))
  predicted <- exp(yhat)
  list(metric = "error", value = mae(actual, predicted))
}
# 5-fold cross-validation of the booster settings, scored with xg_eval_mae
# (lower is better) and stopped early after 15 rounds without improvement.
# Seed the RNG first so fold assignment and subsampling are repeatable from
# run to run; previously the only set.seed() call came after this step.
set.seed(0)
xgb <- xgb.cv(
  params = param,
  data = dtrain,
  nrounds = 2000,
  nfold = 5,
  early_stopping_rounds = 15,
  print_every_n = 10,
  verbose = 1,
  feval = xg_eval_mae,
  # no watchlist here: xgb.cv() evaluates on its own held-out folds
  maximize = FALSE
)
# Fit the final model on the full training matrix with a fixed seed.
# NOTE(review): nrounds is hard-coded to 5000 and does not use the result of
# the cross-validation run above.
set.seed(0)
xgb_final <- xgb.train(
  params = param,
  data = dtrain,
  nrounds = 5000,
  watchlist = watchlist,
  feval = xg_eval_mae,
  print_every_n = 100
)

# Predict on the test matrix and undo the log(loss + 200) transform.
xgb_pred <- exp(predict(xgb_final, dtest)) - 200
summary(xgb_pred)

# Attach the predictions to the saved test ids and write the file to disk.
final$loss <- xgb_pred
write.csv(final, "18.csv", row.names = FALSE)