--process_only_chr option

andrewprzh · andrewprzh · commit 3f77830ba28d · 2025-10-20T22:49:37.000+03:00
diff --git a/isoquant.py b/isoquant.py
@@ -105,6 +105,8 @@ def add_hidden_option(*args, **kwargs):  # show command only with --full-help
                                      "e.g. with official annotations, such as GENCODE; "
                                      "speeds up gene database conversion")
     add_additional_option_to_group(ref_args_group, "--discard_chr", nargs="+", help="chromosome IDs to ignore", type=str, default=[])
+    add_additional_option_to_group(ref_args_group, "--process_only_chr", nargs="+", help="chromosome IDs to process",
+                                   type=str, default=None)
     add_additional_option_to_group(ref_args_group, "--index", help="genome index for specified aligner (optional)",
                                    type=str)
 
@@ -448,6 +450,10 @@ def check_input_params(args):
 
     if args.no_secondary:
         logger.info("--no_secondary option has no effect and will be deprecated, secondary alignments are not used by default")
+
+    if args.process_only_chr and args.discard_chr:
+        args.discard_chr = []
+        logger.warning("--discard_chr has no effect when --process_only_chr is set and will be ignored")
         
     check_input_files(args)
     return True
diff --git a/src/dataset_processor.py b/src/dataset_processor.py
@@ -385,6 +385,8 @@ def process_sample(self, sample):
         logger.info("Experiment has " + proper_plural_form("BAM file", len(sample.file_list)) + ": " + ", ".join(
             map(lambda x: x[0], sample.file_list)))
         self.chr_ids = self.get_chromosome_ids(sample)
+        logger.info("Total number of chromosomes to be processed %d: %s " %
+                    (len(self.chr_ids), ", ".join(map(lambda x: str(x), sorted(self.chr_ids)))))
         self.args.use_technical_replicas = self.args.read_group == "file_name" and len(sample.file_list) > 1
 
         self.all_read_groups = set()
@@ -438,17 +440,23 @@ def process_sample(self, sample):
                 os.remove(f)
         logger.info("Processed experiment " + sample.prefix)
 
+    def keep_only_defined_chromosomes(self, chr_set: set):  # filters chr_set in place and returns that same set object
+        if self.args.process_only_chr:  # allow-list takes precedence: keep only the listed chromosome IDs
+            chr_set.intersection_update(self.args.process_only_chr)
+        elif self.args.discard_chr:  # no allow-list given: drop the listed chromosome IDs instead
+            chr_set.difference_update(self.args.discard_chr)
+
+        return chr_set
+
     def get_chromosome_ids(self, sample):
         genome_chromosomes = set(self.reference_record_dict.keys())
-        for chr_id in self.args.discard_chr:
-            genome_chromosomes.discard(chr_id)
+        genome_chromosomes = self.keep_only_defined_chromosomes(genome_chromosomes)
 
         bam_chromosomes = set()
         for bam_file in list(map(lambda x: x[0], sample.file_list)):
             bam = pysam.AlignmentFile(bam_file, "rb", require_index=True)
             bam_chromosomes.update(bam.references)
-        for chr_id in self.args.discard_chr:
-            bam_chromosomes.discard(chr_id)
+        bam_chromosomes = self.keep_only_defined_chromosomes(bam_chromosomes)
 
         bam_genome_overlap = genome_chromosomes.intersection(bam_chromosomes)
         if len(bam_genome_overlap) != len(genome_chromosomes) or len(bam_genome_overlap) != len(bam_chromosomes):
@@ -472,8 +480,7 @@ def get_chromosome_ids(self, sample):
         gffutils_db = gffutils.FeatureDB(self.args.genedb)
         for feature in gffutils_db.all_features():
             gene_annotation_chromosomes.add(feature.seqid)
-        for chr_id in self.args.discard_chr:
-            gene_annotation_chromosomes.discard(chr_id)
+        gene_annotation_chromosomes = self.keep_only_defined_chromosomes(gene_annotation_chromosomes)
 
         common_overlap = gene_annotation_chromosomes.intersection(bam_genome_overlap)
         if len(common_overlap) != len(gene_annotation_chromosomes):