combine with devel patch_bigsamples

SuiYue-2308 · SuiYue-2308 · commit 99d31f4b1589 · 2025-04-09T13:36:29.000+08:00
Merge branch 'devel' into singleExon_2

# Conflicts:
#	.github/workflows/check-bioc.yml
#	README.md
diff --git a/.github/workflows/check-bioc.yml b/.github/workflows/check-bioc.yml
@@ -54,9 +54,15 @@ jobs:
       fail-fast: false
       matrix:
         config:
+<<<<<<< HEAD
           - { os: ubuntu-latest, r: '4.4', bioc: '3.19', cont: "bioconductor/bioconductor_docker:RELEASE_3_19", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
           - { os: macOS-latest, r: '4.4', bioc: '3.19'}
           - { os: windows-latest, r: '4.4', bioc: '3.19'}
+=======
+          - { os: ubuntu-latest, r: '4.4.2', bioc: '3.20', cont: "bioconductor/bioconductor_docker:RELEASE_3_20", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
+          - { os: macOS-latest, r: '4.4.2', bioc: '3.20'}
+            ##- { os: windows-latest, r: '4.3', bioc: '3.18'}
+>>>>>>> devel
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -107,16 +113,16 @@ jobs:
         uses: actions/cache@v4
         with:
           path: ${{ env.R_LIBS_USER }}
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.3-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.3-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.4.2-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.4.2-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
         uses: actions/cache@v4
         with:
           path: /home/runner/work/_temp/Library
-          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-${{ hashFiles('.github/depends.Rds') }}
-          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.3-
+          key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4.2-${{ hashFiles('.github/depends.Rds') }}
+          restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4.2-
 
       - name: Install Linux system dependencies
         if: runner.os == 'Linux'
@@ -339,7 +345,11 @@ jobs:
         if: failure()
         uses: actions/upload-artifact@v4
         with:
+<<<<<<< HEAD
           name: ${{ runner.os }}-biocversion-RELEASE-r-4.4-results
+=======
+          name: ${{ runner.os }}-biocversion-RELEASE-r-4.4.2-results
+>>>>>>> devel
           path: check
 
       - uses: docker/build-push-action@v1
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Package: bambu
 Type: Package
 Title: Context-Aware Transcript Quantification from Long Read RNA-Seq data
-Version: 3.5.1
+Version: 3.9.3
 Authors@R: c(person("Ying", "Chen", role = c("cre","aut"),
              email = "chen_ying@gis.a-star.edu.sg"),
              person("Andre", "Sim", role = "aut",
diff --git a/R/bambu-extendAnnotations-utilityCombine.R b/R/bambu-extendAnnotations-utilityCombine.R
@@ -17,7 +17,7 @@ isore.combineTranscriptCandidates <- function(readClassList,
     combinedSplicedTranscripts <- 
         combineSplicedTranscriptModels(readClassList, bpParameters, 
         min.readCount, min.readFractionByGene, 
-        min.txScore.multiExon, min.txScore.singleExon, verbose) %>% data.table()
+        min.txScore.multiExon, min.txScore.singleExon, verbose)
     combinedSplicedTranscripts[,confidenceType := "highConfidenceJunctionReads"]
     # when single exon min score is greater than 1, skip unspliced transcripts combination
     # this is a very customized config, useful when data is very big 
@@ -92,40 +92,34 @@ sequentialCombineFeatureTibble <- function(readClassList,
 
 #' @noRd 
 updateStartEndReadCount <- function(combinedFeatureTibble){
-    combinedFeatureTibble <- combinedFeatureTibble %>% 
-        mutate(rowID = row_number())
-    
-    startEndCountTibble <- combinedFeatureTibble %>% 
-        select(rowID, starts_with("start"),starts_with("end"),
-            starts_with("readCount")) %>%
-        tidyr::pivot_longer(c(starts_with("start"),starts_with("end"),
-            starts_with("readCount")), names_to = c(".value","set"),
-            names_pattern = "(.*)\\.(.)") %>%
-        group_by(rowID) %>% 
-        mutate(sumReadCount = sum(readCount,na.rm = TRUE))
+    setDT(combinedFeatureTibble)
+    combinedFeatureTibble[, rowID := .I]
     
-    startTibble <- select(startEndCountTibble, rowID, start, readCount, 
-        sumReadCount) %>% 
-        arrange(start) %>%
-        filter(cumsum(readCount)/sumReadCount>=0.5) %>% 
-        filter(row_number()==1)
-    endTibble <- select(startEndCountTibble, rowID, end, readCount, 
-        sumReadCount) %>% 
-        arrange(end) %>% 
-        filter(cumsum(readCount)/sumReadCount>=0.5) %>% 
-        filter(row_number()==1)
+    colNames <- colnames(combinedFeatureTibble)
+    readCountCols <- sort(colNames[grep("^readCount", colNames)]) # to make sure it's ordered by sample name
+    startCols <- sort(colNames[grep("^start", colNames)])
+    endCols <- sort(colNames[grep("^end", colNames)])
     
-    combinedFeatureTibble <- combinedFeatureTibble %>% 
-        dplyr::select(intronStarts, intronEnds, chr, strand, maxTxScore, 
-            maxTxScore.noFit, NSampleReadCount, NSampleReadProp, 
-            NSampleTxScore, rowID) %>%
-        full_join(select(startTibble, rowID, start), by = "rowID") %>% 
-        full_join(select(endTibble, rowID, end, readCount=sumReadCount), 
-        by = "rowID") %>%
-        select(-rowID)
+    startEndDt <- combinedFeatureTibble[, 
+        .(start = readCountWeightedMedian(.SD,x,y),
+        end = readCountWeightedMedian(.SD,z,y),
+        readCount = sum(.SD[,y], na.rm = TRUE)),
+        by = rowID,  env = I(list(x = startCols, y = readCountCols,z = endCols))]
+
+    combinedFeatureTibble <- startEndDt[combinedFeatureTibble[,.(intronStarts, intronEnds, chr, strand, maxTxScore, 
+                                                                 maxTxScore.noFit, NSampleReadCount, NSampleReadProp, 
+                                                                 NSampleTxScore, rowID)], on = "rowID"]
+    combinedFeatureTibble[, rowID := NULL]
     return(combinedFeatureTibble)
 }
 
+#' Read-count-weighted median without interpolation: smallest value in the `valuevar` columns at or above the 50% quantile, each value repeated by its `timesvar` count
+#' @noRd
+readCountWeightedMedian <- function(dt, valuevar, timesvar){
+    sortVector <- rep(na.omit(unlist(dt[,..valuevar])), 
+                times = as.integer(na.omit(unlist(dt[,..timesvar]))))
+    return(min(sortVector[sortVector>=quantile(sortVector, probs = 0.5)]))
+}
 
 
 #' Function to combine featureTibble and create the NSample variables 
diff --git a/README.md b/README.md
@@ -679,6 +679,7 @@ metadata(rowRanges(se))$warnings
 
 ### Release History
 
+<<<<<<< HEAD
 **bambu v3.9.0**
 
 Release date: 2025-xxx-xx
@@ -715,6 +716,17 @@ Minor changes:
 - Restore fusion mode functionality and added documentation
 - Fixed bug in plot function
 - Update release history
+=======
+**bambu v3.8.2**
+
+Release date: 2025-02-06
+
+Minor changes:
+
+- Fix large number of samples [issue](https://github.com/GoekeLab/bambu/issues/450)  
+- Fix de novo bug
+
+>>>>>>> devel
 
 **bambu v3.2.5**
 
diff --git a/inst/extdata/seIsoReCombined_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds b/inst/extdata/seIsoReCombined_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds
diff --git a/inst/extdata/seIsoReRef_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds b/inst/extdata/seIsoReRef_SGNex_A549_directRNA_replicate5_run1_chr9_1_1000000.rds
diff --git a/tests/testthat/test_isore.R b/tests/testthat/test_isore.R
@@ -71,8 +71,8 @@ test_that("isore.combineTranscriptCandidates completes successfully", {
     
     expect_equal(seIsoReCombined, seIsoReCombinedExpected)
     expect_named(seIsoReCombined,
-                 c('intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',
-                   'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore', 'start', 'end', 'readCount', 'confidenceType') 
+                 c('start', 'end', 'readCount','intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',
+                   'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore',  'confidenceType') 
     )
 })
 

Original file line number	Diff line number	Diff line change
`@@ -71,8 +71,8 @@ test_that("isore.combineTranscriptCandidates completes successfully", {`
`71`	`71`
`72`	`72`	`expect_equal(seIsoReCombined, seIsoReCombinedExpected)`
`73`	`73`	`expect_named(seIsoReCombined,`
`74`		`- c('intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',`
`75`		`- 'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore', 'start', 'end', 'readCount', 'confidenceType')`
	`74`	`+ c('start', 'end', 'readCount','intronStarts', 'intronEnds', 'chr', 'strand', 'maxTxScore', 'maxTxScore.noFit',`
	`75`	`+ 'NSampleReadCount', 'NSampleReadProp', 'NSampleTxScore', 'confidenceType')`
`76`	`76`	`)`
`77`	`77`	`})`
`78`	`78`