@@ -40,7 +40,7 @@ import org.codehaus.groovy.runtime.ResourceGroovyMethods
4040import java.util.concurrent.ConcurrentHashMap
4141import java.util.concurrent.atomic.AtomicInteger
4242
43- import static com.milaboratory.migec.Util.BLANK_PATH
43+ import static com.milaboratory.migec.Util.*
4444
4545// ////////////
4646// CLI //
@@ -66,6 +66,8 @@ cli.N(args: 1,
6666 " Number of reads to take. For downsampling purposes" )
6767cli. _(longOpt : " no-sort" ,
6868 " Do not sort output by clonotype count, can save some time" )
69+ cli. _(longOpt : " allow-chimeric" ,
70+ " Allow V-J combinations coming from different genes" )
6971cli. _(longOpt : " blast-path" , args : 1 , argName : " directory" ,
7072 " Path to blast executable." )
7173cli. _(longOpt : " same-sample" , " Assembled data only (-a). Input files come from the same sample so " +
@@ -110,10 +112,11 @@ if (opt.h) {
110112}
111113
112114// SEGMENTS STUFF
113- boolean includeNonFuncitonal = opt. ' all-segments' , includeAlleles = opt. ' all-alleles'
115+ boolean includeNonFuncitonal = opt. ' all-segments' , includeAlleles = opt. ' all-alleles' ,
116+ allowChimeric = opt. ' allow-chimeric'
114117if (opt. ' print-library' ) {
115118 println " CDR3 extraction is possible for the following data (segments include non-functional = $includeNonFuncitonal ):"
116- Util . listAvailableSegments(includeNonFuncitonal, includeAlleles)
119+ listAvailableSegments(includeNonFuncitonal, includeAlleles)
117120 System . exit(0 )
118121}
119122
@@ -132,14 +135,15 @@ if (!chainOpt) {
132135
133136def chains = chainOpt. split(" ," )
134137
135- chains. each { chain ->
136- if (! Util . isAvailable(species, chain, includeNonFuncitonal, includeAlleles)) {
138+ def hasD = chains. collect { chain ->
139+ if (! isAvailable(species, chain, includeNonFuncitonal, includeAlleles)) {
137140 println " [ERROR] Sorry, no analysis could be performed for $species gene $chain " +
138141 " (include non-functional = $includeNonFuncitonal ). " +
139142 " Possible variants are:\n "
140- Util . listAvailableSegments(includeNonFuncitonal, includeAlleles)
143+ listAvailableSegments(includeNonFuncitonal, includeAlleles)
141144 System . exit(2 )
142145 }
146+ hasD(species, chain, includeNonFuncitonal, includeAlleles)
143147}
144148
145149// SYSTEM
@@ -225,10 +229,10 @@ def vAlleles = new ArrayList<Allele>(), jAlleles = new ArrayList<Allele>(),
225229 dAlleles = new ArrayList<Allele > ()
226230def collapseAlleleMap = new HashMap<String , Allele > ()
227231
228- chains. each { chain ->
232+ chains. eachWithIndex { String chain , int ind ->
229233 println " ${ timestamp()} Loading ${ chain} Variable and Joining segment data"
230234
231- def resFile = Util . getSegmentsFile(includeNonFuncitonal, includeAlleles)
235+ def resFile = getSegmentsFile(includeNonFuncitonal, includeAlleles)
232236
233237 resFile. splitEachLine(" \t " ) {
234238 if (species. toUpperCase() == it[0 ]. toUpperCase() &&
@@ -243,7 +247,7 @@ chains.each { chain ->
243247 // note that D segment could have different strand
244248 dAlleles. add(new Allele (alleleId : it[3 ],
245249 segmentId : it[3 ]. split(" [*]" )[0 ],
246- seq : Util . revCompl(seq)))
250+ seq : revCompl(seq)))
247251 } else {
248252 // More complex processing for V/J alleles
249253 // Mask all except seed region
@@ -272,7 +276,9 @@ chains.each { chain ->
272276 allele = new Allele (alleleId : it[3 ], segmentId : it[3 ]. split(" [*]" )[0 ],
273277 refPoint : refPoint,
274278 seq : seq,
275- type : type)
279+ type : type,
280+ chain : chain,
281+ hasD : hasD[ind])
276282 collapseAlleleMap. put(seq, allele)
277283 } else
278284 allele. alleleId + = " ," + it[3 ]
@@ -331,7 +337,7 @@ def seqMap = new HashMap<String, Integer>(),
331337int counter = 0 , uniqueCounter = 0
332338inputFileNames. each { inputFileName ->
333339 println " ${ timestamp()} Reading $inputFileName and generating list of unique sequences to map V and J genes"
334- def reader = Util . getReader(inputFileName)
340+ def reader = getReader(inputFileName)
335341 String header
336342 while (((header = reader. readLine()) != null ) && (nReads < 0 || counter < nReads)) {
337343 if (assembledInput) {
@@ -360,7 +366,7 @@ inputFileNames.each { inputFileName ->
360366 }
361367}
362368
363- def queryFilePrefix = TMP_FOLDER + " /" + Util . getFastqPrefix(inputFileNames[0 ])
369+ def queryFilePrefix = TMP_FOLDER + " /" + getFastqPrefix(inputFileNames[0 ])
364370println " [${ new Date()} $SCRIPT_NAME ] Making a temporary FASTA file for BLAST queries"
365371int chunk_sz = seqList. size() / THREADS
366372for (int p = 0 ; p < THREADS ; p++ ) { // split fasta for blast parallelizaiton
@@ -454,7 +460,8 @@ GParsPool.withPool THREADS, {
454460 String seq = entry. key
455461 int id = entry. value
456462 def vMapping = vMappingData[id], jMapping = jMappingData[id]
457- if (vMapping != null && jMapping != null ) {
463+ if (vMapping != null && jMapping != null &&
464+ (allowChimeric || vMapping. allele. chain == jMapping. allele. chain)) {
458465 Allele vAllele = vMapping. allele,
459466 jAllele = jMapping. allele
460467 boolean vRC = vMapping. aFrom > vMapping. aTo, jRC = jMapping. aFrom > jMapping. aTo
@@ -485,7 +492,7 @@ GParsPool.withPool THREADS, {
485492 if (vRefInAlign && jRefInAlign && vRC == jRC) {
486493 if (vRC) { // after that step RC is same to non-RC
487494 reverseFound. get()
488- seq = Util . revCompl(seq)
495+ seq = revCompl(seq)
489496
490497 int temp = vMapping. qTo
491498 vMapping. qTo = seq. length() - vMapping. qFrom - 1
@@ -553,16 +560,16 @@ GParsPool.withPool THREADS, {
553560
554561 if (DEBUG ) {
555562 if (bad && badExamples. get() < DEBUG_MAX_MESSAGES ) {
556- def message = ((vRC == jRC && vRC) ? Util . revCompl(seq) : seq) + " \n "
563+ def message = ((vRC == jRC && vRC) ? revCompl(seq) : seq) + " \n "
557564
558565 if (! vRefInAlign)
559566 message + = vAllele. alleleId + " \t " +
560- (vRC ? Util . revCompl(vAllele. seq. toUpperCase()) : vAllele. seq. toUpperCase()) + " \t " +
567+ (vRC ? revCompl(vAllele. seq. toUpperCase()) : vAllele. seq. toUpperCase()) + " \t " +
561568 " aFrom=$vMapping . aFrom aTo=$vMapping . aTo ref=$vRef \n "
562569
563570 if (! jRefInAlign)
564571 message + = jAllele. alleleId + " \t " +
565- (vRC ? Util . revCompl(jAllele. seq. toUpperCase()) : jAllele. seq. toUpperCase()) + " \t " +
572+ (vRC ? revCompl(jAllele. seq. toUpperCase()) : jAllele. seq. toUpperCase()) + " \t " +
566573 " aFrom=$jMapping . aFrom aTo=$jMapping . aTo ref=$jRef \n "
567574
568575 debugMessagesBad. add(message)
@@ -577,7 +584,7 @@ GParsPool.withPool THREADS, {
577584 vAllele. seq. toUpperCase() + " \t " +
578585 " qTo=$vMapping . qTo qFrom=$vMapping . qFrom aTo=$vMapping . aTo aFrom=$vMapping . aFrom ref=$vRef \n "
579586
580- message + = jAllele. alleleId + " \t " + Util . revCompl(jAllele. seq. toUpperCase()) + " \t " +
587+ message + = jAllele. alleleId + " \t " + revCompl(jAllele. seq. toUpperCase()) + " \t " +
581588 " qTo=$jMapping . qTo qFrom=$jMapping . qFrom aTo=$jMapping . aTo aFrom=$jMapping . aFrom ref=$jRef \n "
582589
583590 debugMessagesGood. add(message)
@@ -624,9 +631,9 @@ if (!dAlleles.empty) {
624631// //////////////////////////////
625632counter = 0
626633int goodReads = 0 , goodEvents = 0 , mappedReads = 0 , mappedEvents = 0 , totalReads = 0 , totalEvents = 0
627- def writerCdr3Fastq = cdr3FastqFile ? Util . getWriter(cdr3FastqFile) : null ,
628- writerUnmapped3Fastq = unmappedFastqFile ? Util . getWriter(unmappedFastqFile) : null ,
629- writerCdr3Umi = assembledInput && cdr3UmiTable ? Util . getWriter(cdr3UmiTable) : null
634+ def writerCdr3Fastq = cdr3FastqFile ? getWriter(cdr3FastqFile) : null ,
635+ writerUnmapped3Fastq = unmappedFastqFile ? getWriter(unmappedFastqFile) : null ,
636+ writerCdr3Umi = assembledInput && cdr3UmiTable ? getWriter(cdr3UmiTable) : null
630637
631638if (writerCdr3Umi)
632639 writerCdr3Umi. println (" #umi\t mig_sz\t cdr3nt\t v\t j\t d" )
@@ -635,7 +642,7 @@ def usedUmis = new HashSet<String>()
635642
636643inputFileNames. each { inputFileName ->
637644 println " ${ timestamp()} Appending read data from $inputFileName "
638- def reader = Util . getReader(inputFileName)
645+ def reader = getReader(inputFileName)
639646 while (((header = reader. readLine()) != null ) && (nReads < 0 || counter < nReads)) {
640647 def seq = reader. readLine()
641648 reader. readLine()
@@ -662,7 +669,7 @@ inputFileNames.each { inputFileName ->
662669 qual = qual. substring(clonotypeData. cdrFrom, clonotypeData. cdrTo)
663670
664671 // Increment counters if quality is good
665- if (Util . minQual(qual) >= qualThreshold) {
672+ if (minQual(qual) >= qualThreshold) {
666673 if (assembledInput && sameSample) {
667674 def umiCdr = umi + " _" + clonotypeData. cdr3Seq
668675 duplicateUmi = usedUmis. contains(umi)
@@ -812,7 +819,7 @@ if (logFileName) {
812819 } else {
813820 logFile. absoluteFile. parentFile. mkdirs()
814821 logFile. withPrintWriter { pw ->
815- pw. println (Util . CDRBLAST_LOG_HEADER )
822+ pw. println (CDRBLAST_LOG_HEADER )
816823 }
817824 }
818825
0 commit comments