GoekeLab
diff --git a/‎.github/workflows/check-bioc.yml‎
Lines changed: 5 additions & 5 deletions b/‎.github/workflows/check-bioc.yml‎
Lines changed: 5 additions & 5 deletions
diff --git a/‎.github/workflows/lint.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/lint.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/pr-commands.yaml‎
Lines changed: 2 additions & 2 deletions b/‎.github/workflows/pr-commands.yaml‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎.github/workflows/test-coverage.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/test-coverage.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎DESCRIPTION‎
Lines changed: 2 additions & 1 deletion b/‎DESCRIPTION‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎NAMESPACE‎
Lines changed: 4 additions & 1 deletion b/‎NAMESPACE‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎R/bambu-assignDist.R‎
Lines changed: 110 additions & 0 deletions b/‎R/bambu-assignDist.R‎
Lines changed: 110 additions & 0 deletions
diff --git a/‎R/bambu-extendAnnotations-utilityCombine.R‎
Lines changed: 9 additions & 5 deletions b/‎R/bambu-extendAnnotations-utilityCombine.R‎
Lines changed: 9 additions & 5 deletions
@@ -40,7 +40,7 @@ env:
   run_covr: 'false'
   run_pkgdown: 'false'
   has_RUnit: 'false'
-  cache-version: 'cache-v3'
+  cache-version: 'cache-v4'
   run_docker: 'false'
 
 jobs:
@@ -56,7 +56,7 @@ jobs:
         config:
           - { os: ubuntu-latest, r: '4.4.2', bioc: '3.20', cont: "bioconductor/bioconductor_docker:RELEASE_3_20", rspm: "https://packagemanager.rstudio.com/cran/__linux__/focal/latest" }
           - { os: macOS-latest, r: '4.4.2', bioc: '3.20'}
-            ##- { os: windows-latest, r: '4.3', bioc: '3.18'}
+          ## - { os: windows-latest, r: '4.4', bioc: '3.20'}
           ## Check https://github.com/r-lib/actions/tree/master/examples
           ## for examples using the http-user-agent
     env:
@@ -81,7 +81,7 @@ jobs:
       ## https://github.com/r-lib/actions/blob/master/examples/check-standard.yaml
       ## If they update their steps, we will also need to update ours.
       - name: Checkout Repository
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
 
       ## R is already included in the Bioconductor docker images
       - name: Setup R from r-lib
@@ -104,15 +104,15 @@ jobs:
 
       - name: Restore R package cache
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os != 'Linux'"
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: ${{ env.R_LIBS_USER }}
           key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.4.2-${{ hashFiles('.github/depends.Rds') }}
           restore-keys: ${{ env.cache-version }}-${{ runner.os }}-biocversion-RELEASE-r-4.4.2-
 
       - name: Cache R packages on Linux
         if: "!contains(github.event.head_commit.message, '/nocache') && runner.os == 'Linux' "
-        uses: actions/cache@v3
+        uses: actions/cache@v4
         with:
           path: /home/runner/work/_temp/Library
           key: ${{ env.cache-version }}-${{ runner.os }}-biocversion-devel-r-4.4.2-${{ hashFiles('.github/depends.Rds') }}
 
@@ -15,7 +15,7 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-r@v2
         with:
 
@@ -14,7 +14,7 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/pr-fetch@v2
         with:
@@ -49,7 +49,7 @@ jobs:
     env:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
     steps:
-      - uses: actions/checkout@v2
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/pr-fetch@v2
         with:
 
@@ -16,7 +16,7 @@ jobs:
       GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }}
 
     steps:
-      - uses: actions/checkout@v3
+      - uses: actions/checkout@v4
 
       - uses: r-lib/actions/setup-r@v2
         with:
 
@@ -86,7 +86,8 @@ Imports:
     Rsamtools,
     methods,
     Rcpp,
-    xgboost
+    xgboost,
+    Matrix
 VignetteBuilder: 
     knitr
 LazyData: true
@@ -7,7 +7,9 @@ export(readFromGTF)
 export(transcriptToGeneExpression)
 export(writeBambuOutput)
 export(writeToGTF)
+export(writeAnnotationsToGTF)
 export(trainBambu)
+export(setNDR)
 export(compareTranscripts)
 importFrom(stats,predict)
 importFrom(BiocGenerics,basename)
@@ -73,7 +75,8 @@ import(data.table, except=c(last, first, shift, second, between))
 import(dplyr, except=c(last, first, desc, union, setdiff, intersect, slice))
 import(IRanges, except=c(slice, collapse, setdiff, intersect,cor))
 import(SummarizedExperiment)
-import(S4Vectors, except=c(rename, setequal, setdiff, intersect,cor))
+import(Matrix)
+import(S4Vectors, except=c(rename, setequal, setdiff, intersect,cor, unname, expand))
 useDynLib(bambu, .registration = TRUE)
 import(xgboost)
 import(BSgenome)
@@ -0,0 +1,110 @@
+#' Create equivalence classes and assign them to transcripts
+#'
+#' Computes the read class to transcript distance table, derives the
+#' equivalence classes from it and returns the counts that are available
+#' without running the EM, together with the inputs needed later on.
+#' @inheritParams bambu
+#' @import data.table
+#' @noRd
+assignReadClasstoTranscripts <- function(readClassList, annotations, isoreParameters,
+                                        verbose, demultiplexed, spatial,
+                                        returnDistTable = FALSE, trackReads = TRUE) {
+    # a character input is treated as the path of an RDS file to load
+    if (is.character(readClassList)) {
+        readClassList <- readRDS(file = readClassList)
+    }
+    metadata(readClassList)$readClassDist <- calculateDistTable(
+        readClassList, annotations, isoreParameters, verbose, returnDistTable)
+    readClassList <- splitReadClassFiles(readClassList)
+    # readClassList is not modified below, so its metadata is read only once
+    rcMeta <- metadata(readClassList)
+    distTable <- rcMeta$readClassDist
+
+    eqClassDt <- genEquiRCs(distTable, annotations, verbose)
+    eqClassDt$eqClass.match <- match(eqClassDt$eqClassById, rcMeta$eqClassById)
+    eqClassDt <- simplifyNames(eqClassDt)
+    # multi_align: the equivalence class maps to several transcripts of a gene
+    eqClassDt <- group_by(eqClassDt, eqClassId, gene_sid)
+    eqClassDt <- mutate(eqClassDt, multi_align = length(unique(txid)) > 1)
+    eqClassDt <- ungroup(eqClassDt)
+    eqClassDt <- data.table(mutate(eqClassDt, aval = 1))
+
+    # return non-em counts
+    sampleInfo <- generateColData(colnames(rcMeta$countMatrix),
+        clusters = NULL, demultiplexed, spatial)
+    uniqueCounts <- generateUniqueCounts(eqClassDt, rcMeta$countMatrix,
+        annotations)
+    se <- SummarizedExperiment(
+        assays = SimpleList(counts = uniqueCounts),
+        rowRanges = annotations,
+        colData = sampleInfo)
+    colnames(se) <- sampleInfo$id
+
+    # incompatible counts are only stored when at least one was observed
+    if (sum(rcMeta$incompatibleCountMatrix) == 0) {
+        metadata(se)$incompatibleCounts <- NULL
+    } else {
+        metadata(se)$incompatibleCounts <- generateIncompatibleCounts(
+            rcMeta$incompatibleCountMatrix, annotations)
+    }
+    metadata(se)$nonuniqueCounts <- generateNonUniqueCounts(eqClassDt,
+        rcMeta$countMatrix, annotations)
+    metadata(se)$readClassDt <- eqClassDt
+    metadata(se)$countMatrix <- rcMeta$countMatrix
+    metadata(se)$incompatibleCountMatrix <- rcMeta$incompatibleCountMatrix
+    metadata(se)$sampleNames <- rcMeta$sampleNames
+    if (returnDistTable) {
+        metadata(se)$distTable <- metadata(distTable)$distTableOld
+    }
+    if (trackReads) {
+        metadata(se)$readToTranscriptMap <- generateReadToTranscriptMap(
+            readClassList, distTable, annotations)
+    }
+
+    return(se)
+}
+
+#' Generate unique counts
+#'
+#' Sums, per transcript, the counts of all equivalence classes that are not
+#' flagged as multi_align and that have a matching row in countMatrix.
+#' @param readClassDt table with the columns multi_align, eqClass.match and
+#'     txid
+#' @param countMatrix count matrix whose rows are indexed by eqClass.match
+#' @param annotations annotations providing names() and mcols()$txid
+#' @return sparse matrix with one row per annotation (named by
+#'     names(annotations)) and one column per column of countMatrix
+#' @noRd
+generateUniqueCounts <- function(readClassDt, countMatrix, annotations){
+    x <- readClassDt %>% filter(!multi_align & !is.na(eqClass.match))
+    # all-zero template: the single explicit 0 entry in the last cell only
+    # serves to fix the dimensions of the sparse matrix
+    counts <- sparseMatrix(length(annotations), ncol(countMatrix), x = 0)
+    rownames(counts) <- names(annotations)
+    # no uniquely assigned equivalence class: nothing to add to the template
+    if (nrow(x) == 0) return(counts)
+    # drop = FALSE keeps a matrix when only one row or one column is present,
+    # consistent with generateNonUniqueCounts
+    uniqueCounts <- countMatrix[x$eqClass.match, , drop = FALSE]
+    # transcript x equivalence-class indicator matrix. It is built directly
+    # so that a single transcript level is handled too (the one-level case
+    # that generateNonUniqueCounts has to special-case for
+    # sparse.model.matrix)
+    txFactor <- factor(x$txid)
+    uniqueCounts.tx <- sparseMatrix(i = as.integer(txFactor),
+        j = seq_along(txFactor), x = 1,
+        dims = c(nlevels(txFactor), length(txFactor)))
+    uniqueCounts <- uniqueCounts.tx %*% uniqueCounts
+    # translate the numeric transcript ids into annotation names
+    rownames(uniqueCounts) <- names(annotations)[
+        match(as.numeric(levels(txFactor)), mcols(annotations)$txid)]
+    counts[rownames(uniqueCounts),] <- uniqueCounts
+    return(counts)
+}
+
+
+#' Generate incompatible counts
+#'
+#' Places the rows of the incompatible count matrix into a sparse matrix
+#' that has one row for every gene id found in the annotations.
+#' @noRd
+generateIncompatibleCounts <- function(incompatibleCountMatrix, annotations){
+    # sorted unique gene ids of the annotations
+    geneLevels <- levels(factor(unique(mcols(annotations)$GENEID)))
+    # the incoming row names are used as numeric indices into geneLevels
+    geneIdx <- as.numeric(rownames(incompatibleCountMatrix))
+    rownames(incompatibleCountMatrix) <- geneLevels[geneIdx]
+    # zero-filled sparse matrix covering all genes and all input columns
+    allGenes <- sparseMatrix(length(geneLevels),
+        ncol(incompatibleCountMatrix), x = 0)
+    rownames(allGenes) <- geneLevels
+    allGenes[rownames(incompatibleCountMatrix),] <- incompatibleCountMatrix
+    allGenes
+}
+
+
+#' Generate non-unique counts
+#'
+#' Sums, per gene, the counts of the equivalence classes that are flagged
+#' as multi_align and that have a matching row in countMatrix.
+#' @param readClassDt table with the columns multi_align, eqClass.match,
+#'     eqClassId, gene_sid and txid
+#' @param countMatrix count matrix whose rows are indexed by eqClass.match
+#' @param annotations annotations providing mcols()$GENEID
+#' @return sparse matrix with one row per gene id of the annotations and one
+#'     column per column of countMatrix
+#' @noRd
+generateNonUniqueCounts <- function(readClassDt, countMatrix, annotations){
+    #fuse multi align RCs by gene
+    x <- readClassDt %>% filter(multi_align & !is.na(eqClass.match))
+    x <- x %>% distinct(eqClassId, .keep_all = TRUE)
+    nonuniqueCounts <- countMatrix[x$eqClass.match,, drop = FALSE]
+    # both operands are scalars, so the short-circuit && is used
+    if(nrow(x)>1 && length(unique(x$gene_sid))>1){
+        nonuniqueCounts.gene <- sparse.model.matrix(~ factor(x$gene_sid) - 1)
+    } else{
+        warning("The factor variable 'gene_sid' has only one level. Adjusting output.")
+        # a single column of ones sums all rows into one gene
+        nonuniqueCounts.gene <- Matrix(1, nrow = nrow(x), ncol = 1, sparse = TRUE)
+    }
+    # aggregate equivalence class counts per gene (shared by both branches)
+    nonuniqueCounts <- t(nonuniqueCounts.gene) %*% nonuniqueCounts
+    #convert ids into gene ids
+    geneids <- as.numeric(levels(factor(x$gene_sid)))
+    geneids <- x$txid[match(geneids, x$gene_sid)]
+    # NOTE(review): txid is used here as a positional index into annotations,
+    # whereas generateUniqueCounts matches it against mcols(annotations)$txid
+    # - confirm both are equivalent
+    geneids <- mcols(annotations)$GENEID[as.numeric(geneids)]
+    rownames(nonuniqueCounts) <- geneids
+    #create matrix for all annotated genes
+    genes <- levels(factor(unique(mcols(annotations)$GENEID)))
+    geneMat <- sparseMatrix(length(genes), ncol(nonuniqueCounts), x = 0)
+    rownames(geneMat) <- genes
+    if(!is.null(rownames(nonuniqueCounts))){
+      geneMat[rownames(nonuniqueCounts),] <- nonuniqueCounts
+    }
+    return(geneMat)
+}
@@ -19,6 +19,10 @@ isore.combineTranscriptCandidates <- function(readClassList,
         min.readCount, min.readFractionByGene, 
         min.txScore.multiExon, min.txScore.singleExon, verbose)
     combinedSplicedTranscripts[,confidenceType := "highConfidenceJunctionReads"]
+    # When the single-exon minimum score is greater than 1, skip combining
+    # unspliced transcripts. This is a highly customized configuration that
+    # is useful when the data set is very large.
+    if (min.txScore.singleExon > 1) 
+        return(combinedSplicedTranscripts)
     combinedUnsplicedTranscripts <- 
         combineUnsplicedTranscriptModels(readClassList, bpParameters, 
         stranded, min.readCount, min.readFractionByGene, 
@@ -35,11 +39,11 @@ isore.combineTranscriptCandidates <- function(readClassList,
 combineSplicedTranscriptModels <- function(readClassList, bpParameters, 
         min.readCount, min.readFractionByGene, min.txScore.multiExon, 
         min.txScore.singleExon, verbose){
-    bpParameters$progressbar = FALSE
+    bpParameters$progressbar <- FALSE
     options(scipen = 999) #maintain numeric basepair locations not sci.notfi.
     start.ptm <- proc.time()
     n_sample <- length(readClassList)
-    nGroups = max(ceiling(n_sample/10),min(bpworkers(bpParameters), 
+    nGroups <- max(ceiling(n_sample/10),min(bpworkers(bpParameters), 
                                             round(n_sample/2)))
     indexList <- sample(rep(seq_len(nGroups), length.out=n_sample))
     indexList <- splitAsList(seq_len(n_sample), indexList)
@@ -128,7 +132,7 @@ combineFeatureTibble <- function(combinedFeatureTibble,
             maxTxScore.noFit, NSampleReadCount, NSampleReadProp,NSampleTxScore, 
             starts_with('start'), starts_with('end'), starts_with('readCount'))
     } else { 
-        combinedTable = full_join(combinedFeatureTibble, 
+        combinedTable <- full_join(combinedFeatureTibble, 
             featureTibbleSummarised, by = c('intronStarts', 'intronEnds', 'chr',
             'strand'), suffix=c('.combined','.new')) %>% 
             mutate(NSampleReadCount=pmax0NA(NSampleReadCount.combined) + 
@@ -208,7 +212,7 @@ combineUnsplicedTranscriptModels <-
             min.readFractionByGene, min.txScore.multiExon,
             min.txScore.singleExon, verbose){
         start.ptm <- proc.time()
-        bpParameters$progressbar = FALSE
+        bpParameters$progressbar <- FALSE
         newUnsplicedSeList <- 
             bplapply(seq_along(readClassList), function(sample_id)
                 extractNewUnsplicedRanges(readClassSe = 
@@ -285,7 +289,7 @@ reduceUnsplicedRanges <- function(rangesList, stranded){
 makeUnsplicedTibble <- function(combinedNewUnsplicedSe,newUnsplicedSeList,
         colDataNames,min.readCount, min.readFractionByGene,
         min.txScore.multiExon, min.txScore.singleExon, bpParameters){
-        bpParameters$progressbar = FALSE
+        bpParameters$progressbar <- FALSE
     newUnsplicedTibble <- as_tibble(combinedNewUnsplicedSe) %>%
         rename(chr = seqnames) %>% select(chr, start, end, strand, row_id) %>%
         separate_rows(row_id, sep = "\\+")