version bump 0.8.3

grlloyd · grlloyd · commit 128a3495f9b0 · 2019-11-19T09:40:14.000Z
also force struct version between 0.4.1 and 0.5.0
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: structToolbox
 Type: Package
 Title: Some tools bult using the struct package
-Version: 0.8.2
+Version: 0.8.3
 Author: Gavin Rhys Lloyd
 Maintainer: Gavin Rhys Lloyd <g.r.lloyd@bham.ac.uk>
 Description: Extends the class templates provided by the struct package to provide methods for training PCA, PLS models with cross-validation, permutation testing etc.
@@ -44,6 +44,7 @@ Collate:
     'glog_class.R'
     'grid_search_1d_class.R'
     'hca_class.R'
+    'kfold_xval2_class.R'
     'kfold_xval_class.R'
     'kfold_xval_charts.R'
     'knn_impute_class.R'
@@ -75,7 +76,7 @@ Collate:
     'vec_norm_class.R'
     'wilcox_test_class.R'
     'zzz.R'
-Depends: struct (== 0.4.1)
+Depends: struct (>= 0.4.1), struct(< 0.5.0)
 Imports: ggplot2,
  pmp,
  gridExtra,
@@ -103,7 +104,7 @@ Suggests:
  sbcms,
  Rtsne
 Remotes: computational-metabolomics/pmp,
-  computational-metabolomics/struct@v0.4.1,
+  computational-metabolomics/struct,
   computational-metabolomics/sbcms
 VignetteBuilder: knitr
 biocViews: WorkflowStep
diff --git a/NAMESPACE b/NAMESPACE
@@ -42,6 +42,7 @@ export(grid_search_1d)
 export(gs_line)
 export(hca_dendrogram)
 export(kfold_xval)
+export(kfold_xval2)
 export(kfoldxcv_grid)
 export(kfoldxcv_metric)
 export(knn_impute)
diff --git a/R/PCA_class.R b/R/PCA_class.R
@@ -19,7 +19,8 @@ PCA<-setClass(
         outputs.eigenvalues='data.frame',
         outputs.ssx='numeric',
         outputs.correlation='data.frame',
-        outputs.that='dataset'
+        outputs.that='dataset',
+        outputs.xhat='dataset'
     ),
     prototype = list(name='Principal Component Analysis (PCA)',
         description='PCA is a multivariate data reduction technique. It summarises the data in a smaller number of Principal Components that describe the maximum variation present in the dataset.',
@@ -107,6 +108,12 @@ setMethod(f="model.predict",
         dataset.data(S)=that
         output.value(M,'that')=S
 
+        xhat=as.matrix(that)%*%as.matrix(t(P))
+        xhat=as.data.frame(xhat)
+        rownames(that)=rownames(X)
+        colnames(xhat)=colnames(X)
+        M$xhat=dataset(data=xhat,sample_meta=D$sample_meta,variable_meta=D$variable_meta)
+
         return(M)
     }
 )
diff --git a/R/forward_selection_by_rank_class.R b/R/forward_selection_by_rank_class.R
@@ -167,7 +167,6 @@ eval_loess=function(x,X,Y,k=10,p=0.66)
     # Y = observed values
     # k = number of replicates
     # p = proportion in training
-
     residual=numeric(k)
     for (i in 1:k)
     {
@@ -181,9 +180,16 @@ eval_loess=function(x,X,Y,k=10,p=0.66)
         yy2=Y[X %in% xx2]
 
 
-        loessMod <- loess(yy ~ xx, span=x) # 25% smoothing span
-        smoothed=stats::predict(loessMod,newdata=xx2)
-        residual[i]=sum((smoothed-yy2)^2)
+        loessMod <- loess(yy ~ xx, span=x)
+
+        # check for NaN
+        if (any(is.nan(loessMod$fitted))){
+            residual[i]=99999
+        } else {
+
+            smoothed=stats::predict(loessMod,newdata=xx2)
+            residual[i]=sum((smoothed-yy2)^2)
+        }
     }
     return(sqrt(mean(residual)))
 }
diff --git a/R/kfold_xval2_class.R b/R/kfold_xval2_class.R
@@ -0,0 +1,128 @@
+#' kfold_xval2 class
+#'
+#' Resampler that applies k-fold cross-validation to a model or model.seq().
+#' The prototype defaults to 10 folds assigned with the 'venetian' method.
+#' @export kfold_xval2
+#' @examples
+#' I = kfold_xval2()
+kfold_xval2 <- setClass(
+    Class = "kfold_xval2",
+    contains = "resampler",
+    slots = c(
+        params.folds = "numeric",
+        params.method = "character",
+        params.factor_name = "entity",
+        outputs.metric = "data.frame"
+    ),
+    prototype = list(
+        name = "k-fold cross-validation",
+        type = "resampling",
+        result = "metric",
+        params.folds = 10,
+        params.method = "venetian"
+    )
+)
+
+#' @export
+#' @template run
+setMethod(f="run",
+    signature=c("kfold_xval2",'dataset','metric'),
+    definition=function(I,D,MET=NULL)
+    {
+        # k-fold cross-validation of the model / model.seq attached to I.
+        #
+        # I   : kfold_xval2 object; reads params 'folds' and 'method' and
+        #       the workflow returned by models(I).
+        # D   : dataset to split into folds.
+        # MET : metric object; MET@actual ('sample_meta' or 'data') selects
+        #       which part of the prediction is compared.
+        #
+        # Returns I with I$metric set to a one-row data.frame holding the
+        # metric value for the training-set and test-set predictions.
+
+        X=dataset.data(D)
+        # sample meta data, split per fold alongside X to build the
+        # training / test datasets (previously Y was never defined).
+        # NOTE(review): for MET@actual=='sample_meta' the whole sample_meta
+        # table is passed to the metric; confirm whether it should be
+        # restricted to the 'factor_name' column.
+        Y=D$sample_meta
+
+        WF=models(I)
+        nfolds=param.value(I,'folds')
+        method=param.value(I,'method')
+
+        # only model / model.seq workflows are supported; fail before doing
+        # any work instead of part way through the first fold
+        if (!is(WF,'model_OR_model.seq')) {
+            if (is(WF,'iterator')) {
+                stop('not implemented yet')
+            }
+            stop('kfold_xval2 can only be applied to a model or model.seq')
+        }
+
+        if (!(method %in% c('venetian','blocks','random'))) {
+            stop('unknown method for cross-validation. (try "venetian", "blocks" or "random")')
+        }
+
+        # venetian 123123123123
+        fold_id=rep(seq_len(nfolds),length.out=nrow(X))
+        if (method=='blocks') {
+            # blocks 111122223333
+            fold_id=sort(fold_id)
+        } else if (method=='random') {
+            fold_id=sample(fold_id,length(fold_id),replace = FALSE)
+        }
+
+        # pick the part of a prediction that the metric is computed on
+        extract_yhat=function(p) {
+            if (MET@actual=='sample_meta') {
+                p
+            } else if (MET@actual=='data') {
+                p$data
+            } else {
+                stop('MET$actual not implemented yet')
+            }
+        }
+
+        # allocate an all-NA table with one row per sample of X and the same
+        # columns (and column types) as a prediction
+        alloc_like=function(yhat) {
+            out=yhat[rep(NA_integer_,nrow(X)),,drop=FALSE]
+            rownames(out)=rownames(X)
+            out
+        }
+
+        # prediction tables; created on first use because their columns are
+        # only known once the first prediction is available
+        YHAT=NULL   # test set predictions (each sample predicted once)
+        YHATtr=NULL # training set predictions (later folds overwrite earlier)
+
+        # for each value of k, split the data and run the workflow
+        for (i in seq_len(nfolds))
+        {
+            in_test=fold_id==i
+
+            # prep the training data
+            dtrain=dataset(data=X[!in_test,,drop=FALSE],
+                sample_meta=Y[!in_test,,drop=FALSE])
+            # prep the test data
+            dtest=dataset(data=X[in_test,,drop=FALSE],
+                sample_meta=Y[in_test,,drop=FALSE])
+
+            # train, then apply the model to the training set
+            WF=model.train(WF,dtrain)
+            WF=model.predict(WF,dtrain)
+            yhat=extract_yhat(predicted(WF))
+            if (is.null(YHATtr)) {
+                YHATtr=alloc_like(yhat)
+            }
+            YHATtr[!in_test,]=yhat
+
+            # test set
+            WF=model.predict(WF,dtest)
+            yhat=extract_yhat(predicted(WF))
+            if (is.null(YHAT)) {
+                YHAT=alloc_like(yhat)
+            }
+            YHAT[in_test,]=yhat
+
+            # TODO: validation set predictions are not implemented
+        }
+
+        if (MET@actual=='data') {
+            # if its a model sequence get the prediction from the penultimate
+            # step for comparison with the predictions
+            WF=model.apply(WF,D)
+            n=length(WF)
+            if (n>1) {# just in case a sequence of 1
+                Y=predicted(WF[n-1])$data
+            }
+        }
+
+        df=data.frame('training_set'=0,'test_set'=0,'metric'=class(MET)[[1]])
+        # test sets metric (YHAT holds the held-out predictions)
+        MET=calculate(MET,Y,YHAT)
+        df$test_set=value(MET)
+        # training set metric
+        MET=calculate(MET,Y,YHATtr)
+        df$training_set=value(MET)
+        I$metric=df
+        return(I)
+    }
+)
+
diff --git a/R/permutation_test2_class.R b/R/permutation_test2_class.R
@@ -11,9 +11,8 @@ permutation_test2<-setClass(
     slots=c(
         params.number_of_permutations='numeric',
         params.collect='character',
-        outputs.results.permuted='data.frame',
-        outputs.results.unpermuted='data.frame',
-        outputs.metric='data.frame',
+        outputs.metric_permuted='data.frame',
+        outputs.metric_unpermuted='data.frame',
         outputs.collected='entity'
     ),
     prototype = list(name='permutation test',
@@ -40,9 +39,6 @@ setMethod(f="run",
               WF=models(I)
               n=param.value(I,'number_of_permutations')
 
-              all_results_permuted=data.frame('actual'=rep(y[,1],n),'predicted'=rep(y[,1],n),'permutation'=0)
-              all_results_unpermuted=data.frame('actual'=rep(y[,1],n),'predicted'=rep(y[,1],n),'permutation'=0)
-
               collected=list(permuted=list(),unpermuted=list())
 
               for (i in 1:n)
@@ -142,8 +138,8 @@ setMethod(f="run",
 
               }
               # store results
-              output.value(I,'results.permuted')=all_results_permuted
-              output.value(I,'results.unpermuted')=all_results_unpermuted
+              output.value(I,'metric_permuted')=all_results_permuted
+              output.value(I,'metric_unpermuted')=all_results_unpermuted
               return(I)
           }
 )
diff --git a/R/r_squared_class.R b/R/r_squared_class.R
@@ -19,9 +19,14 @@ setMethod(f="calculate",
     signature=c('r_squared'),
     definition=function(obj,Y,Yhat)
     {
-        SSR  = sum((Yhat-mean(Y))^2)
-        SSE  = sum((Y-Yhat)^2)
-        SSTO = sum((Y-mean(Y))^2)
+
+        M=matrix(colMeans(Y),nrow=1)
+        O=matrix(1,nrow=nrow(Y),ncol=1)
+        M=O %*% M
+
+        SSR  = sum(sum((Yhat-M)^2))
+        SSE  = sum(sum((Y-Yhat)^2))
+        SSTO = sum(sum((Y-M)^2))
 
         R2=1-(SSE/SSTO)
 
diff --git a/man/kfold_xval2-class.Rd b/man/kfold_xval2-class.Rd
diff --git a/man/run.Rd b/man/run.Rd