Skip to content

Commit acea376

Browse files
committed
use string pools inside counter to avoid duplication
1 parent 2146234 commit acea376

File tree

5 files changed

+140
-79
lines changed

5 files changed

+140
-79
lines changed

src/assignment_aggregator.py

Lines changed: 11 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -30,9 +30,9 @@
3030

3131

3232
class ReadAssignmentAggregator:
33-
def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzipped=False, grouping_strategy_names=None):
33+
def __init__(self, args, sample, string_pools, gffutils_db=None, chr_id=None, gzipped=False, grouping_strategy_names=None):
3434
self.args = args
35-
self.read_groups = read_groups
35+
self.string_pools = string_pools
3636
self.grouping_strategy_names = grouping_strategy_names if grouping_strategy_names else ["default"]
3737
self.common_header = "# Command line: " + args._cmd_line + "\n# IsoQuant version: " + args._version + "\n"
3838
self.io_support = IOSupport(self.args)
@@ -95,8 +95,9 @@ def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzi
9595
if self.args.count_exons and self.args.genedb:
9696
exon_counts_path = sample.get_exon_counts_file(chr_id) if chr_id else sample.out_exon_counts_tsv
9797
intron_counts_path = sample.get_intron_counts_file(chr_id) if chr_id else sample.out_intron_counts_tsv
98-
self.exon_counter = ExonCounter(exon_counts_path, ignore_read_groups=True)
99-
self.intron_counter = IntronCounter(intron_counts_path, ignore_read_groups=True)
98+
# string_pools=None means ungrouped counting
99+
self.exon_counter = ExonCounter(exon_counts_path)
100+
self.intron_counter = IntronCounter(intron_counts_path)
100101
self.global_counter.add_counters([self.exon_counter, self.intron_counter])
101102

102103
if self.args.read_group and self.args.genedb:
@@ -112,12 +113,12 @@ def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzi
112113
gene_counter = create_gene_counter(gene_out_file,
113114
self.args.gene_quantification,
114115
complete_feature_list=self.gene_set,
115-
read_groups=self.read_groups[group_idx],
116+
string_pools=self.string_pools,
116117
group_index=group_idx)
117118
transcript_counter = create_transcript_counter(transcript_out_file,
118119
self.args.transcript_quantification,
119120
complete_feature_list=self.transcript_set,
120-
read_groups=self.read_groups[group_idx],
121+
string_pools=self.string_pools,
121122
group_index=group_idx)
122123

123124
self.global_counter.add_counters([gene_counter, transcript_counter])
@@ -129,8 +130,8 @@ def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzi
129130
else:
130131
exon_out_file = f"{sample.out_exon_grouped_counts_tsv}_{strategy_name}"
131132
intron_out_file = f"{sample.out_intron_grouped_counts_tsv}_{strategy_name}"
132-
exon_counter = ExonCounter(exon_out_file, group_index=group_idx)
133-
intron_counter = IntronCounter(intron_out_file, group_index=group_idx)
133+
exon_counter = ExonCounter(exon_out_file, string_pools=self.string_pools, group_index=group_idx)
134+
intron_counter = IntronCounter(intron_out_file, string_pools=self.string_pools, group_index=group_idx)
134135
self.global_counter.add_counters([exon_counter, intron_counter])
135136

136137
if self.args.read_group and not self.args.no_model_construction:
@@ -145,12 +146,12 @@ def __init__(self, args, sample, read_groups, gffutils_db=None, chr_id=None, gzi
145146
transcript_model_counter = create_transcript_counter(
146147
transcript_model_out_file,
147148
self.args.transcript_quantification,
148-
read_groups=self.read_groups[group_idx],
149+
string_pools=self.string_pools,
149150
group_index=group_idx)
150151
gene_model_counter = create_gene_counter(
151152
gene_model_out_file,
152153
self.args.gene_quantification,
153-
read_groups=self.read_groups[group_idx],
154+
string_pools=self.string_pools,
154155
group_index=group_idx)
155156

156157
self.transcript_model_global_counter.add_counter(transcript_model_counter)

src/dataset_processor.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,7 @@
3838
from .barcode_calling.umi_filtering import create_transcript_info_dict
3939
from .table_splitter import split_read_table_parallel
4040
from .assignment_aggregator import ReadAssignmentAggregator
41+
from .string_pools import setup_string_pools
4142
from .parallel_workers import (
4243
collect_reads_in_parallel,
4344
construct_models_in_parallel,
@@ -368,8 +369,15 @@ def process_assigned_reads(self, sample, dump_filename):
368369
logger.info("Transcript models construction is turned %s" %
369370
("off" if self.args.no_model_construction else "on"))
370371

372+
# Build string pools for the merge phase (to convert group names <-> IDs)
373+
# This must match the pools used by parallel workers
374+
string_pools = None
375+
if self.args.read_group:
376+
string_pools = setup_string_pools(self.args, sample, chr_ids, chr_id=None,
377+
load_barcode_pool=False, load_tsv_pools=False)
378+
371379
# set up aggregators and outputs
372-
aggregator = ReadAssignmentAggregator(self.args, sample, self.all_read_groups, gzipped=self.args.gzipped,
380+
aggregator = ReadAssignmentAggregator(self.args, sample, string_pools, gzipped=self.args.gzipped,
373381
grouping_strategy_names=self.grouping_strategy_names)
374382
transcript_stat_counter = EnumStats()
375383

src/graph_based_model_construction.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -162,22 +162,22 @@ def forward_counts(self, read_assignments):
162162
for read_assignment in self.transcript_read_ids[transcript_id]:
163163
read_id = read_assignment.read_id
164164
if self.read_assignment_counts[read_id] == 1:
165-
# Add to ungrouped counters
166-
self.transcript_counter.add_read_info_raw(read_id, [transcript_id], read_assignment.read_group)
167-
self.gene_counter.add_read_info_raw(read_id, [gene_id], read_assignment.read_group)
165+
# Add to ungrouped counters - use read_group_ids (integers) directly
166+
self.transcript_counter.add_read_info_raw(read_id, [transcript_id], read_assignment.read_group_ids)
167+
self.gene_counter.add_read_info_raw(read_id, [gene_id], read_assignment.read_group_ids)
168168
continue
169169

170170
if read_id not in ambiguous_assignments:
171-
ambiguous_assignments[read_id] = [read_assignment.read_group]
171+
ambiguous_assignments[read_id] = [read_assignment.read_group_ids]
172172
ambiguous_assignments[read_id].append(transcript_id)
173173

174174
for read_id in ambiguous_assignments.keys():
175-
read_groups = ambiguous_assignments[read_id][0]
175+
read_group_ids = ambiguous_assignments[read_id][0]
176176
transcript_ids = ambiguous_assignments[read_id][1:]
177177
gene_ids = [transcript2gene[transcript_id] for transcript_id in transcript_ids]
178-
# Add to ungrouped counters
179-
self.transcript_counter.add_read_info_raw(read_id, transcript_ids, read_groups)
180-
self.gene_counter.add_read_info_raw(read_id, gene_ids, read_groups)
178+
# Add to ungrouped counters - use read_group_ids (integers) directly
179+
self.transcript_counter.add_read_info_raw(read_id, transcript_ids, read_group_ids)
180+
self.gene_counter.add_read_info_raw(read_id, gene_ids, read_group_ids)
181181

182182
for r in read_assignments:
183183
if self.read_assignment_counts[r.read_id] > 0: continue

0 commit comments

Comments
 (0)