
Commit 7f3410d

Skip sam metadata if we have too many references
This can cause OOM issues.
1 parent 4ec58ae commit 7f3410d


lib/galaxy/datatypes/binary.py

Lines changed: 17 additions & 12 deletions
@@ -552,18 +552,23 @@ class _BamOrSam:
     Helper class to set the metadata common to sam and bam files
     """

+    max_references = 100000
+
     def set_meta(self, dataset: DatasetProtocol, overwrite: bool = True, **kwd) -> None:
         try:
-            bam_file = pysam.AlignmentFile(dataset.get_file_name(), mode="rb")
-            # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
-            dataset.metadata.reference_names = list(bam_file.references)
-            dataset.metadata.reference_lengths = list(bam_file.lengths)
-            dataset.metadata.bam_header = dict(bam_file.header.items())  # type: ignore [attr-defined]
-            dataset.metadata.read_groups = [
-                read_group["ID"] for read_group in dataset.metadata.bam_header.get("RG", []) if "ID" in read_group
-            ]
-            dataset.metadata.sort_order = dataset.metadata.bam_header.get("HD", {}).get("SO", None)
-            dataset.metadata.bam_version = dataset.metadata.bam_header.get("HD", {}).get("VN", None)
+            with pysam.AlignmentFile(dataset.get_file_name(), mode="rb", check_sq=False) as bam_file:
+                # TODO: Reference names, lengths, read_groups and headers can become very large, truncate when necessary
+                if bam_file.nreferences <= self.max_references:
+                    dataset.metadata.reference_names = list(bam_file.references)
+                    dataset.metadata.reference_lengths = list(bam_file.lengths)
+                    dataset.metadata.bam_header = dict(bam_file.header.items())  # type: ignore [attr-defined]
+                    dataset.metadata.read_groups = [
+                        read_group["ID"]
+                        for read_group in dataset.metadata.bam_header.get("RG", [])
+                        if "ID" in read_group
+                    ]
+                dataset.metadata.sort_order = bam_file.header.get("HD", {}).get("SO", None)  # type: ignore [attr-defined]
+                dataset.metadata.bam_version = bam_file.header.get("HD", {}).get("VN", None)  # type: ignore [attr-defined]
         except Exception:
             # Per Dan, don't log here because doing so will cause datasets that
             # fail metadata to end in the error state
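A minimal standalone sketch of the guard introduced above, outside Galaxy's metadata machinery (the file path and the MAX_REFERENCES constant here are illustrative, not part of the commit):

import pysam

MAX_REFERENCES = 100000  # illustrative; mirrors the max_references class attribute above

# check_sq=False lets pysam open files whose header carries no @SQ records
with pysam.AlignmentFile("example.bam", mode="rb", check_sq=False) as bam_file:
    if bam_file.nreferences <= MAX_REFERENCES:
        reference_names = list(bam_file.references)
        reference_lengths = list(bam_file.lengths)
    else:
        # A BAM aligned against a highly fragmented assembly can carry millions of
        # @SQ entries; skipping these lists avoids holding them all in memory.
        reference_names = reference_lengths = None

The same check is what keeps set_meta from materialising huge reference_names / reference_lengths lists and header dictionaries for such files.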
@@ -1054,7 +1059,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
         """
         # The best way to ensure that BAM files are coordinate-sorted and indexable
         # is to actually index them.
-        with pysam.AlignmentFile(filename=file_name) as f:
+        with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
             # The only sure thing we know here is that the sort order can't be coordinate
             return f.header.get("HD", {}).get("SO") == "coordinate"  # type: ignore[attr-defined]
@@ -1074,7 +1079,7 @@ def dataset_content_needs_grooming(self, file_name: str) -> bool:
         """
         # The best way to ensure that BAM files are coordinate-sorted and indexable
         # is to actually index them.
-        with pysam.AlignmentFile(filename=file_name) as f:
+        with pysam.AlignmentFile(filename=file_name, check_sq=False) as f:
             return f.header.get("HD", {}).get("SO") != "queryname"  # type: ignore[attr-defined]
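The check_sq=False added to the two dataset_content_needs_grooming calls above is a pysam option that disables the check for @SQ records when opening a file; without it, pysam raises a ValueError for SAM/BAM files that define no reference sequences (for example unaligned reads). A rough sketch of the difference, using a hypothetical header-only file:

import pysam

path = "reads_without_sq.bam"  # hypothetical file whose header lacks @SQ lines

try:
    pysam.AlignmentFile(path)  # the default check_sq=True rejects such a file
except ValueError as exc:
    print("rejected:", exc)

with pysam.AlignmentFile(path, check_sq=False) as f:
    # With the check disabled the file opens and the header is still inspectable.
    print(f.header.to_dict().get("HD", {}).get("SO"))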