fix dratio

grlloyd · grlloyd · commit 611668f7115e · 2023-07-20T10:46:30.000+01:00
- update equation(s) to match paper
- add option to use sd or mad
- add second version of dratio from paper
- add tests
diff --git a/R/d_ratio_filter_class.R b/R/d_ratio_filter_class.R
@@ -4,11 +4,19 @@
 #' M = dratio_filter(threshold=20,qc_label='QC',factor_name='Class')
 #' M = model_apply(M,D)
 #' @export dratio_filter
-dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) {
+dratio_filter = function(
+        threshold=20, 
+        qc_label='QC', 
+        factor_name, 
+        method='ratio',
+        dispersion='sd',
+        ...) {
     out=struct::new_struct('dratio_filter',
         threshold=threshold,
         qc_label=qc_label,
         factor_name=factor_name,
+        method=method,
+        dispersion=dispersion,
         ...)
     return(out)
 }
@@ -21,7 +29,9 @@ dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) {
         factor_name='entity',
         filtered='entity',
         flags='entity',
-        d_ratio='data.frame'
+        d_ratio='data.frame',
+        method='enum',
+        dispersion='enum'
     ),
     prototype=list(name = 'Dispersion ratio filter',
         description = paste0('The dispersion ratio (d-ratio) compares the ',
@@ -33,7 +43,7 @@ dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) {
         'the feature is removed.'),
         type = 'filter',
         predicted = 'filtered',
-        .params=c('threshold','qc_label','factor_name'),
+        .params=c('threshold','qc_label','factor_name','method','dispersion'),
         .outputs=c('filtered','flags','d_ratio'),
         citations=list(
             bibentry(
@@ -75,8 +85,35 @@ dratio_filter = function(threshold=20, qc_label='QC', factor_name, ...) {
             description = 'Flag indicating whether the feature was rejected by the filter or not.',
             type='data.frame',
             value=data.frame()
+        ),
+        method=enum(
+            name='dratio method',
+            description = c(
+                'ratio' = paste0('Dispersion of the QCs divided by the ',
+                'dispersion of the samples. Corresponds to Eq 4 in ',
+                ' Broadhurst et al (2018).'),
+                'euclidean' = paste0('Dispersion of the QCs divided by the ',
+                'euclidean length of the total dispersion. Total dispersion ',
+                'is estimated from the QC and Sample dispersion by assuming ',
+                'that they are orthogonal. Corresponds to Eq 5 in ',
+                'Broadhurst et al (2018)')),
+            allowed=c('ratio','euclidean'),
+            value='ratio',
+            type='character',
+            max_length = 1
+        ),
+        dispersion=enum(
+            name='Dispersion method',
+            description = c(
+                'sd' = paste0('Dispersion is estimated using the ',
+                'standard deviation.'),
+                'mad' = paste0('Dispersion is estimated using the median ',
+                'absolute deviation.')),
+            allowed=c('sd','mad'),
+            value='sd',
+            type='character',
+            max_length = 1
         )
-
     )
 )
 
@@ -86,25 +123,42 @@ setMethod(f="model_train",
     signature=c("dratio_filter","DatasetExperiment"),
     definition=function(M,D)
     {
-        # mad QC samples
+        # dispersion QC samples
         QC = filter_smeta(
                 mode='include',
                 levels=M$qc_label,
                 factor_name=M$factor_name)
         QC = model_apply(QC,D)
         QC = predicted(QC)$data
-        QC = apply(QC,2,mad,na.rm=TRUE)
+        
+        if (M$dispersion=='mad') {
+            QC = apply(QC,2,mad,na.rm=TRUE)
+        } else {
+            QC = apply(QC,2,sd,na.rm=TRUE)
+        }
 
-        # mad not qc samples
+        # dispersion (not QC) samples
         S = filter_smeta(
                 mode='exclude',
                 levels=M$qc_label,
                 factor_name=M$factor_name)
         S = model_apply(S,D)
         S = predicted(S)$data
-        S = apply(S,2,mad,na.rm=TRUE)
+        
+        if (M$dispersion=='mad') {
+            S = apply(S,2,mad,na.rm=TRUE) # constant = 1.4826 default
+        } else {
+            S = apply(S,2,sd,na.rm=TRUE)
+        }
 
-        d_ratio=(QC/(QC+S))*100
+        # dispersion ratio
+        if (M$method=='ratio') {
+            # eq 4
+            d_ratio=(QC/S)*100
+        } else {
+            # eq 5
+            d_ratio= (QC / sqrt((QC^2) + (S^2))) * 100
+        }
 
         OUT=d_ratio>M$threshold
 
diff --git a/man/dratio_filter.Rd b/man/dratio_filter.Rd
diff --git a/tests/testthat/test-dratio-filter.R b/tests/testthat/test-dratio-filter.R
@@ -1,12 +1,119 @@
 # filter d-ratio
-test_that('d-ratio filter works as expected',{
+test_that('d-ratio filter works as expected using sd ratio',{
     set.seed('57475')
     # DatasetExperiment
     DE = MTBLS79_DatasetExperiment(filtered=TRUE)
     M = dratio_filter(
         threshold=20,
         qc_label = 'QC',
-        factor_name='Class'
+        factor_name='Class',
+        method = 'ratio',
+        dispersion = 'sd'
+    )
+    M = model_apply(M,DE)
+    
+    # manually compute d-ratio
+    qc_mad=unlist(
+        lapply(DE$data[DE$sample_meta$Class=='QC',],sd,na.rm=TRUE)
+    )
+    sa_mad=unlist(
+        lapply(DE$data[DE$sample_meta$Class!='QC',],sd,na.rm=TRUE)
+    )
+    dr = (qc_mad/(sa_mad))*100
+    
+    # check manual vs fcn
+    expect_true(all(dr==M$d_ratio$d_ratio))
+    
+    # check values havent changed
+    expect_equal(dr[[1]],23.05762,tolerance = 5e-6)
+    expect_equal(dr[[100]],64.49242,tolerance = 5e-6)
+    
+    # just number of filtered columns
+    expect_true(ncol(predicted(M))==725)
+    expect_true(ncol(DE)-ncol(predicted(M))==854)
+    expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20))
+})
+
+test_that('d-ratio filter works as expected using mad ratio',{
+    set.seed('57475')
+    # DatasetExperiment
+    DE = MTBLS79_DatasetExperiment(filtered=TRUE)
+    M = dratio_filter(
+        threshold=20,
+        qc_label = 'QC',
+        factor_name='Class',
+        method = 'ratio',
+        dispersion = 'mad'
+    )
+    M = model_apply(M,DE)
+    
+    # manually compute d-ratio
+    qc_mad=unlist(
+        lapply(DE$data[DE$sample_meta$Class=='QC',],mad,na.rm=TRUE)
+    )
+    sa_mad=unlist(
+        lapply(DE$data[DE$sample_meta$Class!='QC',],mad,na.rm=TRUE)
+    )
+    dr = (qc_mad/(sa_mad))*100
+    
+    # check manual vs fcn
+    expect_true(all(dr==M$d_ratio$d_ratio))
+    
+    # check values havent changed
+    expect_equal(dr[[1]],14.14758,tolerance = 5e-6)
+    expect_equal(dr[[100]],48.23871,tolerance = 5e-6)
+    
+    # just number of filtered columns
+    expect_true(ncol(predicted(M))==834)
+    expect_true(ncol(DE)-ncol(predicted(M))==745)
+    expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20))
+})
+
+test_that('d-ratio filter works as expected using sd euclidean',{
+    set.seed('57475')
+    # DatasetExperiment
+    DE = MTBLS79_DatasetExperiment(filtered=TRUE)
+    M = dratio_filter(
+        threshold=20,
+        qc_label = 'QC',
+        factor_name='Class',
+        method = 'euclidean',
+        dispersion = 'sd'
+    )
+    M = model_apply(M,DE)
+    
+    # manually compute d-ratio
+    qc_mad=unlist(
+        lapply(DE$data[DE$sample_meta$Class=='QC',],sd,na.rm=TRUE)
+    )
+    sa_mad=unlist(
+        lapply(DE$data[DE$sample_meta$Class!='QC',],sd,na.rm=TRUE)
+    )
+    dr = qc_mad/sqrt((qc_mad^2) + (sa_mad^2))*100
+    
+    # check manual vs fcn
+    expect_true(all(dr==M$d_ratio$d_ratio))
+    
+    # check values havent changed
+    expect_equal(dr[[1]],22.46809,tolerance = 5e-6)
+    expect_equal(dr[[100]],54.19862,tolerance = 5e-6)
+    
+    # just number of filtered columns
+    expect_true(ncol(predicted(M))==747)
+    expect_true(ncol(DE)-ncol(predicted(M))==832)
+    expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20))
+})
+
+test_that('d-ratio filter works as expected using mad euclidean',{
+    set.seed('57475')
+    # DatasetExperiment
+    DE = MTBLS79_DatasetExperiment(filtered=TRUE)
+    M = dratio_filter(
+        threshold=20,
+        qc_label = 'QC',
+        factor_name='Class',
+        method = 'euclidean',
+        dispersion = 'mad'
     )
     M = model_apply(M,DE)
     
@@ -17,17 +124,17 @@ test_that('d-ratio filter works as expected',{
     sa_mad=unlist(
         lapply(DE$data[DE$sample_meta$Class!='QC',],mad,na.rm=TRUE)
     )
-    dr = (qc_mad/(qc_mad+sa_mad))*100
+    dr = qc_mad/sqrt((qc_mad^2) + (sa_mad^2))*100
     
     # check manual vs fcn
     expect_true(all(dr==M$d_ratio$d_ratio))
     
     # check values havent changed
-    expect_equal(dr[[1]],12.39,tolerance = 1e3)
-    expect_equal(dr[[100]],32.54,tolerance = 1e3)
+    expect_equal(dr[[1]],14.00808,tolerance = 5e-6)
+    expect_equal(dr[[100]],43.44776,tolerance = 5e-6)
     
     # just number of filtered columns
-    expect_true(ncol(predicted(M))==1052)
-    expect_true(ncol(DE)-ncol(predicted(M))==527)
+    expect_true(ncol(predicted(M))==850)
+    expect_true(ncol(DE)-ncol(predicted(M))==729)
     expect_true(ncol(DE)-ncol(predicted(M))==sum(dr>20))
 })