Skip to content

Commit 3f77830

Browse files
committed
--process_only_chr option
1 parent bbd57b6 commit 3f77830

File tree

2 files changed

+19
-6
lines changed

2 files changed

+19
-6
lines changed

isoquant.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,8 @@ def add_hidden_option(*args, **kwargs): # show command only with --full-help
105105
"e.g. with official annotations, such as GENCODE; "
106106
"speeds up gene database conversion")
107107
add_additional_option_to_group(ref_args_group, "--discard_chr", nargs="+", help="chromosome IDs to ignore", type=str, default=[])
108+
add_additional_option_to_group(ref_args_group, "--process_only_chr", nargs="+", help="chromosome IDs to process",
109+
type=str, default=None)
108110
add_additional_option_to_group(ref_args_group, "--index", help="genome index for specified aligner (optional)",
109111
type=str)
110112

@@ -448,6 +450,10 @@ def check_input_params(args):
448450

449451
if args.no_secondary:
450452
logger.info("--no_secondary option has no effect and will be deprecated, secondary alignments are not used by default")
453+
454+
if args.process_only_chr and args.discard_chr:
455+
args.discard_chr = []
456+
logger.warning("--discard_chr has not effect when --process_only_chr is set and will be ignored")
451457

452458
check_input_files(args)
453459
return True

src/dataset_processor.py

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -385,6 +385,8 @@ def process_sample(self, sample):
385385
logger.info("Experiment has " + proper_plural_form("BAM file", len(sample.file_list)) + ": " + ", ".join(
386386
map(lambda x: x[0], sample.file_list)))
387387
self.chr_ids = self.get_chromosome_ids(sample)
388+
logger.info("Total number of chromosomes to be processed %d: %s " %
389+
(len(self.chr_ids), ", ".join(map(lambda x: str(x), sorted(self.chr_ids)))))
388390
self.args.use_technical_replicas = self.args.read_group == "file_name" and len(sample.file_list) > 1
389391

390392
self.all_read_groups = set()
@@ -438,17 +440,23 @@ def process_sample(self, sample):
438440
os.remove(f)
439441
logger.info("Processed experiment " + sample.prefix)
440442

443+
def keep_only_defined_chromosomes(self, chr_set: set):
444+
if self.args.process_only_chr:
445+
chr_set.intersection_update(self.args.process_only_chr)
446+
elif self.args.discard_chr:
447+
chr_set.difference_update(self.args.discard_chr)
448+
449+
return chr_set
450+
441451
def get_chromosome_ids(self, sample):
442452
genome_chromosomes = set(self.reference_record_dict.keys())
443-
for chr_id in self.args.discard_chr:
444-
genome_chromosomes.discard(chr_id)
453+
genome_chromosomes = self.keep_only_defined_chromosomes(genome_chromosomes)
445454

446455
bam_chromosomes = set()
447456
for bam_file in list(map(lambda x: x[0], sample.file_list)):
448457
bam = pysam.AlignmentFile(bam_file, "rb", require_index=True)
449458
bam_chromosomes.update(bam.references)
450-
for chr_id in self.args.discard_chr:
451-
bam_chromosomes.discard(chr_id)
459+
bam_chromosomes = self.keep_only_defined_chromosomes(bam_chromosomes)
452460

453461
bam_genome_overlap = genome_chromosomes.intersection(bam_chromosomes)
454462
if len(bam_genome_overlap) != len(genome_chromosomes) or len(bam_genome_overlap) != len(bam_chromosomes):
@@ -472,8 +480,7 @@ def get_chromosome_ids(self, sample):
472480
gffutils_db = gffutils.FeatureDB(self.args.genedb)
473481
for feature in gffutils_db.all_features():
474482
gene_annotation_chromosomes.add(feature.seqid)
475-
for chr_id in self.args.discard_chr:
476-
gene_annotation_chromosomes.discard(chr_id)
483+
gene_annotation_chromosomes = self.keep_only_defined_chromosomes(gene_annotation_chromosomes)
477484

478485
common_overlap = gene_annotation_chromosomes.intersection(bam_genome_overlap)
479486
if len(common_overlap) != len(gene_annotation_chromosomes):

0 commit comments

Comments
 (0)