3
3
# ' @include PLSR_class.R
4
4
# ' @examples
5
5
# ' M = PLSDA('number_components'=2,factor_name='Species')
6
- PLSDA = function (number_components = 2 ,factor_name ,... ) {
6
+ PLSDA = function (number_components = 2 ,factor_name ,pred_method = ' max_prob ' , ... ) {
7
7
out = struct :: new_struct(' PLSDA' ,
8
- number_components = number_components ,
9
- factor_name = factor_name ,
10
- ... )
8
+ number_components = number_components ,
9
+ factor_name = factor_name ,
10
+ pred_method = pred_method ,
11
+ ... )
11
12
return (out )
12
13
}
13
14
@@ -29,19 +30,24 @@ PLSDA = function(number_components=2,factor_name,...) {
29
30
pred = ' data.frame' ,
30
31
threshold = ' numeric' ,
31
32
sr = ' entity' ,
32
- sr_pvalue = ' entity'
33
+ sr_pvalue = ' entity' ,
34
+ pred_method = ' entity'
33
35
34
36
),
35
- prototype = list (name = ' Partial least squares discriminant analysis' ,
37
+ prototype = list (
38
+ name = ' Partial least squares discriminant analysis' ,
36
39
type = " classification" ,
37
40
predicted = ' pred' ,
38
41
libraries = ' pls' ,
39
- description = paste0(' PLS is a multivariate regression technique that ' ,
42
+ description = paste0(
43
+ ' PLS is a multivariate regression technique that ' ,
40
44
' extracts latent variables maximising covariance between the input ' ,
41
45
' data and the response. The Discriminant Analysis variant uses group ' ,
42
- ' labels in the response variable and applies a threshold to the ' ,
43
- ' predicted values in order to predict group membership for new samples.' ),
44
- .params = c(' number_components' ,' factor_name' ),
46
+ ' labels in the response variable. For >2 groups a 1-vs-all ' ,
47
+ ' approach is used. Group membership can be predicted for test ' ,
48
+ ' samples based on a probability estimate of group membership, ' ,
49
+ ' or the estimated y-value.' ),
50
+ .params = c(' number_components' ,' factor_name' ,' pred_method' ),
45
51
.outputs = c(
46
52
' scores' ,
47
53
' loadings' ,
@@ -57,12 +63,28 @@ PLSDA = function(number_components=2,factor_name,...) {
57
63
' sr' ,
58
64
' sr_pvalue' ),
59
65
60
- number_components = entity(value = 2 ,
66
+ number_components = entity(
67
+ value = 2 ,
61
68
name = ' Number of components' ,
62
69
description = ' The number of PLS components' ,
63
70
type = c(' numeric' ,' integer' )
64
71
),
65
72
factor_name = ents $ factor_name ,
73
+ pred_method = enum(
74
+ name = ' Prediction method' ,
75
+ description = c(
76
+ ' max_yhat' =
77
+ paste0(' The predicted group is selected based on the ' ,
78
+ ' largest value of y_hat.' ),
79
+ ' max_prob' =
80
+ paste0(' The predicted group is selected based on the ' ,
81
+ ' largest probability of group membership.' )
82
+ ),
83
+ value = ' max_prob' ,
84
+ allowed = c(' max_yhat' ,' max_prob' ),
85
+ type = ' character' ,
86
+ max_length = 1
87
+ ),
66
88
sr = entity(
67
89
name = ' Selectivity ratio' ,
68
90
description = paste0(
@@ -92,8 +114,8 @@ PLSDA = function(number_components=2,factor_name,...) {
92
114
pages = ' 122-128' ,
93
115
author = as.person(" Nestor F. Perez and Joan Ferre and Ricard Boque" ),
94
116
title = paste0(' Calculation of the reliability of ' ,
95
- ' classification in discriminant partial least-squares ' ,
96
- ' binary classification' ),
117
+ ' classification in discriminant partial least-squares ' ,
118
+ ' binary classification' ),
97
119
journal = " Chemometrics and Intelligent Laboratory Systems"
98
120
),
99
121
bibentry(
@@ -113,80 +135,83 @@ PLSDA = function(number_components=2,factor_name,...) {
113
135
# ' @export
114
136
# ' @template model_train
115
137
setMethod (f="model_train ",
116
- signature = c(" PLSDA" ,' DatasetExperiment' ),
117
- definition = function (M ,D )
118
- {
119
- SM = D $ sample_meta
120
- y = SM [[M $ factor_name ]]
121
- # convert the factor to a design matrix
122
- z = model.matrix(~ y + 0 )
123
- z [z == 0 ]= - 1 # +/-1 for PLS
124
-
125
- X = as.matrix(D $ data ) # convert X to matrix
126
-
127
- Z = as.data.frame(z )
128
- colnames(Z )= as.character(interaction(' PLSDA' ,1 : ncol(Z ),sep = ' _' ))
129
-
130
- D $ sample_meta = cbind(D $ sample_meta ,Z )
131
-
132
- # PLSR model
133
- N = PLSR(number_components = M $ number_components ,factor_name = colnames(Z ))
134
- N = model_apply(N ,D )
135
-
136
- # copy outputs across
137
- output_list(M ) = output_list(N )
138
-
139
- # some specific outputs for PLSDA
140
- output_value(M ,' design_matrix' )= Z
141
- output_value(M ,' y' )= D $ sample_meta [,M $ factor_name ,drop = FALSE ]
142
-
143
- # for PLSDA compute probabilities
144
- probs = prob(as.matrix(M $ yhat ),as.matrix(M $ yhat ),D $ sample_meta [[M $ factor_name ]])
145
- output_value(M ,' probability' )= as.data.frame(probs $ ingroup )
146
- output_value(M ,' threshold' )= probs $ threshold
147
-
148
- # update column names for outputs
149
- colnames(M $ reg_coeff )= levels(y )
150
- colnames(M $ sr )= levels(y )
151
- colnames(M $ vip )= levels(y )
152
- colnames(M $ yhat )= levels(y )
153
- colnames(M $ design_matrix )= levels(y )
154
- colnames(M $ probability )= levels(y )
155
- names(M $ threshold )= levels(y )
156
- colnames(M $ sr_pvalue )= levels(y )
157
-
158
- return (M )
159
- }
138
+ signature = c(" PLSDA" ,' DatasetExperiment' ),
139
+ definition = function (M ,D )
140
+ {
141
+ SM = D $ sample_meta
142
+ y = SM [[M $ factor_name ]]
143
+ # convert the factor to a design matrix
144
+ z = model.matrix(~ y + 0 )
145
+ z [z == 0 ]= - 1 # +/-1 for PLS
146
+
147
+ X = as.matrix(D $ data ) # convert X to matrix
148
+
149
+ Z = as.data.frame(z )
150
+ colnames(Z )= as.character(interaction(' PLSDA' ,1 : ncol(Z ),sep = ' _' ))
151
+
152
+ D $ sample_meta = cbind(D $ sample_meta ,Z )
153
+
154
+ # PLSR model
155
+ N = PLSR(number_components = M $ number_components ,factor_name = colnames(Z ))
156
+ N = model_apply(N ,D )
157
+
158
+ # copy outputs across
159
+ output_list(M ) = output_list(N )
160
+
161
+ # some specific outputs for PLSDA
162
+ output_value(M ,' design_matrix' )= Z
163
+ output_value(M ,' y' )= D $ sample_meta [,M $ factor_name ,drop = FALSE ]
164
+
165
+ # for PLSDA compute probabilities
166
+ probs = prob(as.matrix(M $ yhat ),as.matrix(M $ yhat ),D $ sample_meta [[M $ factor_name ]])
167
+ output_value(M ,' probability' )= as.data.frame(probs $ ingroup )
168
+ output_value(M ,' threshold' )= probs $ threshold
169
+
170
+ # update column names for outputs
171
+ colnames(M $ reg_coeff )= levels(y )
172
+ colnames(M $ sr )= levels(y )
173
+ colnames(M $ vip )= levels(y )
174
+ colnames(M $ yhat )= levels(y )
175
+ colnames(M $ design_matrix )= levels(y )
176
+ colnames(M $ probability )= levels(y )
177
+ names(M $ threshold )= levels(y )
178
+ colnames(M $ sr_pvalue )= levels(y )
179
+
180
+ return (M )
181
+ }
160
182
)
161
183
162
184
# ' @export
163
185
# ' @template model_predict
164
186
setMethod (f="model_predict ",
165
- signature = c(" PLSDA" ,' DatasetExperiment' ),
166
- definition = function (M ,D )
167
- {
168
- # call PLSR predict
169
- N = callNextMethod(M ,D )
170
- SM = N $ y
171
-
172
- # # probability estimate
173
- # http://www.eigenvector.com/faq/index.php?id=38%7C
174
- p = as.matrix(N $ pred )
175
- d = prob(x = p ,yhat = as.matrix(N $ yhat ),ytrue = SM [[M $ factor_name ]])
176
- pred = (p > d $ threshold )* 1
177
- pred = apply(pred ,MARGIN = 1 ,FUN = which.max )
178
- hi = apply(d $ ingroup ,MARGIN = 1 ,FUN = which.max ) # max probability
179
- if (sum(is.na(pred )> 0 )) {
180
- pred [is.na(pred )]= hi [is.na(pred )] # if none above threshold, use group with highest probability
181
- }
182
- pred = factor (pred ,levels = 1 : nlevels(SM [[M $ factor_name ]]),labels = levels(SM [[M $ factor_name ]])) # make sure pred has all the levels of y
183
- q = data.frame (" pred" = pred )
184
- output_value(M ,' pred' )= q
185
- return (M )
186
- }
187
+ signature = c(" PLSDA" ,' DatasetExperiment' ),
188
+ definition = function (M ,D )
189
+ {
190
+ # call PLSR predict
191
+ N = callNextMethod(M ,D )
192
+ SM = N $ y
193
+
194
+ # # probability estimate
195
+ # http://www.eigenvector.com/faq/index.php?id=38%7C
196
+ p = as.matrix(N $ pred )
197
+ d = prob(x = p ,yhat = as.matrix(N $ yhat ),ytrue = M $ y [[M $ factor_name ]])
198
+
199
+ # predictions
200
+ if (M $ pred_method == ' max_yhat' ) {
201
+ pred = apply(p ,MARGIN = 1 ,FUN = which.max )
202
+ } else if (M $ pred_method == ' max_prob' ) {
203
+ pred = apply(d $ ingroup ,MARGIN = 1 ,FUN = which.max )
204
+ }
205
+ pred = factor (pred ,levels = 1 : nlevels(SM [[M $ factor_name ]]),labels = levels(SM [[M $ factor_name ]])) # make sure pred has all the levels of y
206
+ q = data.frame (" pred" = pred )
207
+ output_value(M ,' pred' )= q
208
+ return (M )
209
+ }
187
210
)
188
211
189
212
213
+
214
+
190
215
prob = function (x ,yhat ,ytrue )
191
216
{
192
217
# x is predicted values
@@ -250,8 +275,7 @@ prob=function(x,yhat,ytrue)
250
275
}
251
276
252
277
253
- gauss_intersect = function (m1 ,m2 ,s1 ,s2 )
254
- {
278
+ gauss_intersect = function (m1 ,m2 ,s1 ,s2 ) {
255
279
# https://stackoverflow.com/questions/22579434/python-finding-the-intersection-point-of-two-gaussian-curves
256
280
a = (1 / (2 * s1 * s1 ))- (1 / (2 * s2 * s2 ))
257
281
b = (m2 / (s2 * s2 )) - (m1 / (s1 * s1 ))
0 commit comments